Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions gen/runed/v1/runed.pb.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

75 changes: 73 additions & 2 deletions internal/backend/llama.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,43 @@ type Config struct {
CtxSize int // default 2048; --ctx-size in tokens = max input length (llama-server rejects longer input with HTTP 400)
}

// ServingState is the backend's current ability to serve embeddings, as
// reported through the daemon's Health RPC.
//
// The zero value is ServingOK. ServingState implements fmt.Stringer so that
// logs and test failures formatted with %v or %s show the constant's name
// instead of a bare integer.
type ServingState int

const (
	// ServingOK: the llama-server child is up and answering /health.
	ServingOK ServingState = iota
	// ServingIdle: the child was intentionally stopped after an idle period
	// to free model memory. The next Embed RPC resurrects it (a one-off
	// cold-start latency). This is expected, not a fault.
	ServingIdle
	// ServingDegraded: the child is up but failing /health, or it exited
	// unexpectedly. A genuine problem.
	ServingDegraded
)

// String returns the name of the constant s holds, or "ServingUnknown" for a
// value outside the declared set.
func (s ServingState) String() string {
	switch s {
	case ServingOK:
		return "ServingOK"
	case ServingIdle:
		return "ServingIdle"
	case ServingDegraded:
		return "ServingDegraded"
	default:
		return "ServingUnknown"
	}
}

// childState records why the llama-server child is not currently running,
// so Serving can tell an intentional idle-suspend apart from a crash.
//
// The zero value is childRunning. Serving consults the state only after it
// has found no child process attached (cmd == nil); while a child is
// attached, the /health probe decides the result instead.
//
// Writers visible in this file: startLocked sets childRunning when it
// installs a new cmd, watchChild sets childFailed whenever the current child
// exits, and Stop overrides that to childSuspended after it has stopped a
// child that was running.
type childState int

const (
	childRunning   childState = iota // child up (or starting)
	childSuspended                   // intentionally stopped (idle suspend / restart)
	childFailed                      // exited unexpectedly
)

type LlamaBackend struct {
cfg Config

cmd *exec.Cmd
port int
cmdDone chan struct{} // closed by watchChild when current cmd.Wait returns
stopping bool // true while stopLocked is intentionally terminating the child
mu sync.Mutex // protects cmd, port, cmdDone, stopping — short critical sections only
// state records why the child is down (childSuspended vs childFailed) so
// Serving maps it to ServingIdle vs ServingDegraded. Protected by mu.
state childState
mu sync.Mutex // protects cmd, port, cmdDone, stopping, state — short critical sections only

// lifecycleMu serializes Start / Stop / EnsureStarted. RPC handlers
// call EnsureStarted on every request so the contention here must
Expand Down Expand Up @@ -166,6 +195,7 @@ func (b *LlamaBackend) startLocked(ctx context.Context) error {
b.cmd = cmd
b.port = 0
b.cmdDone = done
b.state = childRunning
b.mu.Unlock()
go b.watchChild(cmd, done)

Expand Down Expand Up @@ -263,6 +293,11 @@ func (b *LlamaBackend) watchChild(cmd *exec.Cmd, done chan struct{}) {
if b.cmd == cmd {
b.cmd = nil
b.port = 0
// Any exit leaves the child down; default to failed so a crash or a
// failed (re)start reports DEGRADED. Stop() (the idle-suspend path)
// overrides to childSuspended afterward — it waits on done, so its
// write lands strictly after this one.
b.state = childFailed
}
b.mu.Unlock()
// Log before close(done) so readers of done can rely on the log
Expand Down Expand Up @@ -302,6 +337,27 @@ func (b *LlamaBackend) IsHealthy(ctx context.Context) bool {
return resp.StatusCode == 200
}

// Serving reports whether the backend can currently serve embeddings, and if
// not, why. Health maps the result onto the proto Status enum:
// ServingOK→OK, ServingIdle→IDLE (intentional idle-suspend, resumes on the
// next RPC), ServingDegraded→DEGRADED (process up-but-unhealthy, or crashed).
//
// The /health probe is issued without holding mu, so the child can be taken
// down while the probe is in flight. A failed probe is therefore ambiguous on
// its own: the child may be up but unhealthy, or it may have been stopped
// between the snapshot and the probe. To avoid reporting an intentional
// idle-suspend as DEGRADED in that window, cmd and state are re-read under mu
// after a failed probe and the decision is made on the fresh snapshot.
func (b *LlamaBackend) Serving(ctx context.Context) ServingState {
	b.mu.Lock()
	haveCmd := b.cmd != nil
	st := b.state
	b.mu.Unlock()

	if haveCmd {
		if b.IsHealthy(ctx) {
			return ServingOK
		}
		// The probe failed. Re-check whether a child is still attached: if it
		// is, this is a genuine up-but-unhealthy child; if it went away while
		// we were probing, fall through and classify by the recorded reason.
		b.mu.Lock()
		haveCmd = b.cmd != nil
		st = b.state
		b.mu.Unlock()
		if haveCmd {
			return ServingDegraded
		}
	}

	// No child attached: only an intentional suspend counts as idle; every
	// other reason (including the zero value) is reported as degraded.
	if st == childSuspended {
		return ServingIdle
	}
	return ServingDegraded
}

// Stop terminates the llama-server child if running. After Stop returns,
// cmd is nil and port is 0 — a subsequent EnsureStarted re-launches the
// child cleanly.
Expand All @@ -316,7 +372,22 @@ func (b *LlamaBackend) Stop(ctx context.Context) error {
defer b.inflightMu.Unlock()
b.lifecycleMu.Lock()
defer b.lifecycleMu.Unlock()
return b.stopLocked(ctx)
b.mu.Lock()
wasRunning := b.cmd != nil
b.mu.Unlock()
err := b.stopLocked(ctx)
// Stop is the intentional idle-suspend path (the idle ticker). Mark the
// child suspended so Serving reports IDLE, not DEGRADED — but only if we
// actually stopped a running child, so a Stop on an already-crashed
// backend doesn't mask the failure. stopLocked waits on done, so
// watchChild's childFailed write has already landed; this override runs
// after it.
if wasRunning {
b.mu.Lock()
b.state = childSuspended
b.mu.Unlock()
}
return err
}

func (b *LlamaBackend) stopLocked(ctx context.Context) error {
Expand Down
121 changes: 121 additions & 0 deletions internal/backend/serving_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
//go:build !windows

package backend

import (
"context"
"net/http"
"net/http/httptest"
"os/exec"
"testing"
"time"
)

// TestServing_DistinguishesIdleFromDegraded checks the Serving mapping given a
// known child state: an intentional idle-suspend is ServingIdle (not a fault)
// while a crash or an up-but-unhealthy child is ServingDegraded. This is what
// lets Health surface STATUS_IDLE instead of misleading users with
// STATUS_DEGRADED after the idle ticker stops llama-server to free memory.
func TestServing_DistinguishesIdleFromDegraded(t *testing.T) {
t.Run("child down after intentional suspend → idle", func(t *testing.T) {
b := NewLlamaBackend(Config{})
b.mu.Lock()
b.cmd = nil
b.state = childSuspended
b.mu.Unlock()
if got := b.Serving(context.Background()); got != ServingIdle {
t.Fatalf("Serving() = %v, want ServingIdle", got)
}
})

t.Run("child down after unexpected exit → degraded", func(t *testing.T) {
b := NewLlamaBackend(Config{})
b.mu.Lock()
b.cmd = nil
b.state = childFailed
b.mu.Unlock()
if got := b.Serving(context.Background()); got != ServingDegraded {
t.Fatalf("Serving() = %v, want ServingDegraded", got)
}
})

t.Run("child up and healthy → ok", func(t *testing.T) {
fake := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)
}))
defer fake.Close()
host, port := splitHostPort(t, fake.URL)
b := NewLlamaBackend(Config{Host: host})
b.mu.Lock()
b.cmd = &exec.Cmd{} // non-nil → child considered up
b.port = port
b.state = childRunning
b.mu.Unlock()
if got := b.Serving(context.Background()); got != ServingOK {
t.Fatalf("Serving() = %v, want ServingOK", got)
}
})

t.Run("child up but unhealthy → degraded", func(t *testing.T) {
fake := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusServiceUnavailable)
}))
defer fake.Close()
host, port := splitHostPort(t, fake.URL)
b := NewLlamaBackend(Config{Host: host})
b.mu.Lock()
b.cmd = &exec.Cmd{} // non-nil → child considered up
b.port = port
b.state = childRunning
b.mu.Unlock()
if got := b.Serving(context.Background()); got != ServingDegraded {
t.Fatalf("Serving() = %v, want ServingDegraded", got)
}
})
}

// TestServing_LifecycleTransitions drives the real child lifecycle (via a
// placeholder `sleep` process) to verify which teardown paths produce IDLE vs
// DEGRADED. The key regression: only Stop() (the idle ticker) yields IDLE — an
// internal stopLocked from a failed (re)start, or a crash, must stay DEGRADED.
func TestServing_LifecycleTransitions(t *testing.T) {
	t.Run("Stop (idle-suspend) → idle", func(t *testing.T) {
		b := NewLlamaBackend(Config{})
		cmd := attachPlaceholderChild(t, b, 1)
		defer cleanupPlaceholderChild(cmd)
		if err := b.Stop(context.Background()); err != nil {
			t.Fatalf("Stop: %v", err)
		}
		if got := b.Serving(context.Background()); got != ServingIdle {
			t.Fatalf("after Stop: got %v, want ServingIdle", got)
		}
	})

	t.Run("internal stopLocked (failed start/restart) → degraded, not idle", func(t *testing.T) {
		b := NewLlamaBackend(Config{})
		cmd := attachPlaceholderChild(t, b, 1)
		defer cleanupPlaceholderChild(cmd)
		// startLocked / EnsureStarted stop the child via stopLocked (not Stop)
		// when a (re)start fails. That must NOT be reported as an idle-suspend.
		// The stop error is deliberately ignored: only the resulting serving
		// state is under test here.
		b.lifecycleMu.Lock()
		_ = b.stopLocked(context.Background())
		b.lifecycleMu.Unlock()
		if got := b.Serving(context.Background()); got != ServingDegraded {
			t.Fatalf("internal stop should be DEGRADED, got %v", got)
		}
	})

	t.Run("unexpected exit (crash) → degraded", func(t *testing.T) {
		b := NewLlamaBackend(Config{})
		cmd := attachPlaceholderChild(t, b, 1)
		defer cleanupPlaceholderChild(cmd)
		// Crash: no Stop/stopLocked involved. The Kill error is ignored on
		// purpose; what matters is that the backend notices the exit.
		_ = cmd.Process.Kill()
		deadline := time.Now().Add(2 * time.Second)
		for b.getCmd() != nil && time.Now().Before(deadline) {
			time.Sleep(10 * time.Millisecond)
		}
		// Fail loudly if the exit was never observed. With cmd still attached,
		// Serving takes the "child up" branch and reports DEGRADED whenever
		// the health probe fails, so the assertion below could pass without
		// the crash path having been exercised at all.
		if b.getCmd() != nil {
			t.Fatalf("child exit was not observed within 2s; cmd is still attached")
		}
		if got := b.Serving(context.Background()); got != ServingDegraded {
			t.Fatalf("after crash: got %v, want ServingDegraded", got)
		}
	})
}
12 changes: 8 additions & 4 deletions internal/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ func (s *Server) Info(ctx context.Context, _ *runedv1.InfoRequest) (*runedv1.Inf
}

// Health maps backend state to the proto Status enum. SHUTTING_DOWN
// outranks LOADING/DEGRADED/OK so a drain-in-progress daemon doesn't
// outranks LOADING/IDLE/DEGRADED/OK so a drain-in-progress daemon doesn't
// advertise itself as ready (the GracefulStop race would otherwise
// surface Unavailable right after the OK response).
//
Expand Down Expand Up @@ -278,11 +278,15 @@ func (s *Server) Health(ctx context.Context, _ *runedv1.HealthRequest) (*runedv1
}
return resp, nil
}
if !b.IsHealthy(ctx) {
switch b.Serving(ctx) {
case backend.ServingIdle:
resp.Status = runedv1.HealthResponse_STATUS_IDLE
resp.Message = "embedder suspended after idle to free memory; the next request resumes it automatically"
case backend.ServingDegraded:
resp.Status = runedv1.HealthResponse_STATUS_DEGRADED
return resp, nil
default:
resp.Status = runedv1.HealthResponse_STATUS_OK
}
resp.Status = runedv1.HealthResponse_STATUS_OK
return resp, nil
}

Expand Down
1 change: 1 addition & 0 deletions proto/runed/v1/runed.proto
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ message HealthResponse {
STATUS_LOADING = 2; // bootstrap: fetching llama-server, downloading embedding model
STATUS_DEGRADED = 3;
STATUS_SHUTTING_DOWN = 4;
STATUS_IDLE = 5; // backend intentionally suspended after idle to free memory; resumes on next request
}

// STATUS_LOADING sub-phase
Expand Down
Loading