Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
b8cc8d0
Add ClaudeSession: bidirectional stream-json subprocess
FidoCanCode Apr 14, 2026
bb0028c
Add ClaudeSession.switch_model via /model slash command
FidoCanCode Apr 14, 2026
2a7ba4f
Add ClaudeSession.consume_until_result
FidoCanCode Apr 14, 2026
e9fecb6
Wire ClaudeSession lifecycle into Worker.run
FidoCanCode Apr 14, 2026
e36d899
Supervise ClaudeSession: detect death and restart without losing task
FidoCanCode Apr 14, 2026
8022d07
Extract idle timeout poll interval and log truncation into constants
FidoCanCode Apr 14, 2026
d8adea6
Log exceptions in stop() cleanup instead of swallowing silently
FidoCanCode Apr 14, 2026
6a520be
Remove JSONDecodeError handler and let exception propagate
FidoCanCode Apr 14, 2026
7fa2635
Add threading lock and context manager to ClaudeSession
FidoCanCode Apr 14, 2026
90af85f
Expose lock-break path so preempt/interrupt can bypass the session lock
FidoCanCode Apr 14, 2026
dd418ff
Implement preempt via control_request interrupt with queued follow-up…
FidoCanCode Apr 14, 2026
abc06ad
Route claude_start (setup phase) through the persistent ClaudeSession
FidoCanCode Apr 14, 2026
3b39118
make interrupt cancel the in-flight turn so the lock holder exits
FidoCanCode Apr 14, 2026
4ef93f0
make interrupt kill the in-flight turn, then acquire the lock before …
FidoCanCode Apr 14, 2026
2e5bd31
Route execute_task and nudge loop through the persistent ClaudeSession
FidoCanCode Apr 14, 2026
1bf4d89
Route handle_threads and handle_ci claude_run calls through the persi…
FidoCanCode Apr 14, 2026
c482dcd
Remove setup_session_id state and dead --resume plumbing now supersed…
FidoCanCode Apr 14, 2026
98146ca
Switch models per phase: Opus for planning/triage, Sonnet for impleme…
FidoCanCode Apr 14, 2026
87aef81
preserve ClaudeSession across worker death so webhook-driven interrup…
FidoCanCode Apr 14, 2026
45f3367
persist preempt intent to durable queue so interrupts survive re-pree…
FidoCanCode Apr 14, 2026
10f1f0d
reuse JsonFileStore for preempt queue via PreemptQueue in state.py
FidoCanCode Apr 14, 2026
02b8550
remove unnecessary default model setting
FidoCanCode Apr 14, 2026
09ea998
replace ClaudeStreamError(-1) magic number with a named constant or s…
FidoCanCode Apr 14, 2026
1fbf956
replace repeated truncation slicing with a helper that wraps log calls
FidoCanCode Apr 14, 2026
492b27d
use context manager for lock acquire/release in interrupt and preempt
FidoCanCode Apr 14, 2026
37adbf5
surface errors instead of swallowing them silently
FidoCanCode Apr 14, 2026
a4001dc
Reraise the exception
FidoCanCode Apr 14, 2026
407efb5
reraise exceptions in restart cleanup instead of swallowing
FidoCanCode Apr 14, 2026
d26e3fd
reraise exceptions in restart() instead of swallowing them
FidoCanCode Apr 14, 2026
fac9838
skip kill in restart() when process already dead
FidoCanCode Apr 14, 2026
e0b2351
kill and safe restart on idle timeout in iter_events()
FidoCanCode Apr 14, 2026
6c2d8dd
Bound context growth: restart ClaudeSession on issue boundary
FidoCanCode Apr 14, 2026
3de3118
surface current ClaudeSession owner thread in kennel status line
FidoCanCode Apr 14, 2026
affac1c
clarify ClaudeSession persists across worker death but not kennel/hom…
FidoCanCode Apr 14, 2026
6aad683
fix lost-interrupt race: move _cancel.clear() from __enter__ to iter_…
FidoCanCode Apr 14, 2026
bb6dd41
fix session teardown: preserve session on crash, rescue it on restart
FidoCanCode Apr 14, 2026
d903e8c
wrap stop() body in outer try/finally to always unregister child
FidoCanCode Apr 14, 2026
5d1cba4
send stream-json control_request interrupt in ClaudeSession.interrupt
FidoCanCode Apr 14, 2026
36693c6
drop PreemptQueue — no production path uses it end-to-end
FidoCanCode Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ Multi-repo: one kennel process handles multiple repos. Each repo has its own tas

**Concurrency model**: one fido per repo, one issue per fido, one PR per issue. Fido finishes the current issue (PR merged or closed) before picking up the next. Two repos = two fidos max, running in parallel, each on their own issue.

**ClaudeSession persistence**: the persistent `ClaudeSession` (bidirectional stream-json subprocess) is held on `WorkerThread._session` and survives individual `Worker` crashes — the watchdog restarts the thread and the next `Worker` inherits the same session. It does *not* survive a kennel/home restart: `os.execvp` replaces the process entirely, so the new kennel starts with `_session = None` and creates a fresh session on its first iteration.

## Runner vs workspace clones

Kennel runs from a dedicated **runner clone** at `/home/rhencke/home-runner/`, separate from the **workspace clone** at `/home/rhencke/workspace/home/`.
Expand Down
350 changes: 340 additions & 10 deletions kennel/claude.py

Large diffs are not rendered by default.

42 changes: 37 additions & 5 deletions kennel/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ class WorkerRegistry:
registry.stop_all() # clean shutdown
"""

def __init__(self, thread_factory: Callable[[RepoConfig], WorkerThread]) -> None:
def __init__(self, thread_factory: Callable[..., WorkerThread]) -> None:
self._threads: dict[str, WorkerThread] = {}
self._factory = thread_factory
self._activities: dict[str, WorkerActivity] = {}
Expand All @@ -83,8 +83,24 @@ def __init__(self, thread_factory: Callable[[RepoConfig], WorkerThread]) -> None
self._webhook_lock = threading.Lock()

def start(self, repo_cfg: RepoConfig) -> None:
"""Create and start a WorkerThread for *repo_cfg*."""
thread = self._factory(repo_cfg)
"""Create and start a WorkerThread for *repo_cfg*.

If a previous thread for this repo crashed (dead but not stopped
orderly), its live session is rescued and handed to the replacement so
the persistent :class:`~kennel.claude.ClaudeSession` survives the crash.
"""
session = None
session_issue = None
old_thread = self._threads.get(repo_cfg.name)
if (
old_thread is not None
and not old_thread.is_alive()
and not old_thread._stop
):
# Crashed thread — rescue the live session before replacing it
session, old_thread._session = old_thread._session, None
session_issue, old_thread._session_issue = old_thread._session_issue, None
thread = self._factory(repo_cfg, session=session, session_issue=session_issue)
self._threads[repo_cfg.name] = thread
with self._started_at_lock:
self._started_at[repo_cfg.name] = _utcnow()
Expand Down Expand Up @@ -248,12 +264,24 @@ def get_thread_crash_error(self, repo_name: str) -> str | None:
thread = self._threads.get(repo_name)
return thread.crash_error if thread is not None else None

def get_session_owner(self, repo_name: str) -> str | None:
"""Return the name of the thread currently holding the ClaudeSession lock.

Delegates to :attr:`~kennel.worker.WorkerThread.session_owner` on the
registered thread. Returns ``None`` when no thread is registered for
the repo, no session exists, or the lock is currently free.
"""
thread = self._threads.get(repo_name)
return thread.session_owner if thread is not None else None


def _make_thread(
repo_cfg: RepoConfig,
registry: WorkerRegistry,
*,
gh: GitHub,
session=None,
session_issue=None,
_WorkerThread=WorkerThread,
) -> WorkerThread:
"""Default factory: create a WorkerThread with the provided GitHub client."""
Expand All @@ -263,6 +291,8 @@ def _make_thread(
gh,
registry,
repo_cfg.membership,
session=session,
session_issue=session_issue,
)


Expand All @@ -279,8 +309,10 @@ def make_registry(
(with a mock factory) in tests instead of calling this.
"""

def factory(cfg: RepoConfig) -> WorkerThread:
return _thread_factory(cfg, registry, gh=gh)
def factory(cfg: RepoConfig, *, session=None, session_issue=None) -> WorkerThread:
return _thread_factory(
cfg, registry, gh=gh, session=session, session_issue=session_issue
)

registry = WorkerRegistry(factory)
for repo_cfg in repos.values():
Expand Down
1 change: 1 addition & 0 deletions kennel/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,7 @@ def do_GET(self) -> None:
),
"worker_uptime_seconds": worker_uptime,
"webhook_activities": webhooks,
"session_owner": self.registry.get_session_owner(a.repo_name),
}
)
body = json.dumps(activities).encode()
Expand Down
13 changes: 12 additions & 1 deletion kennel/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class RepoStatus:
task_total: int | None = None
worker_uptime: int | None = None
webhook_activities: list[WebhookActivityInfo] = field(default_factory=list)
session_owner: str | None = None


@dataclass
Expand Down Expand Up @@ -196,6 +197,7 @@ def _fetch_activities(
"is_stuck": item.get("is_stuck", False),
"worker_uptime_seconds": item.get("worker_uptime_seconds"),
"webhook_activities": item.get("webhook_activities", []),
"session_owner": item.get("session_owner"),
}
for item in data
if "repo_name" in item and "what" in item
Expand Down Expand Up @@ -317,6 +319,7 @@ def repo_status(
worker_stuck: bool = False,
worker_uptime: int | None = None,
webhook_activities: list[WebhookActivityInfo] | None = None,
session_owner: str | None = None,
) -> RepoStatus:
"""Collect status for a single repo."""
webhook_activities = list(webhook_activities or [])
Expand All @@ -343,6 +346,7 @@ def repo_status(
last_crash_error=last_crash_error,
worker_stuck=worker_stuck,
webhook_activities=webhook_activities,
session_owner=session_owner,
)

fido_dir = git_dir / "fido"
Expand Down Expand Up @@ -388,6 +392,7 @@ def repo_status(
last_crash_error=last_crash_error,
worker_stuck=worker_stuck,
webhook_activities=webhook_activities,
session_owner=session_owner,
)


Expand Down Expand Up @@ -427,6 +432,7 @@ def collect() -> KennelStatus:
worker_stuck=info["is_stuck"] if info else False,
worker_uptime=worker_uptime_val,
webhook_activities=webhook_list,
session_owner=info.get("session_owner") if info else None,
)
)
return KennelStatus(kennel_pid=pid, kennel_uptime=uptime, repos=repos)
Expand Down Expand Up @@ -497,8 +503,13 @@ def _format_repo_body(repo: RepoStatus) -> list[str]:

if repo.claude_pid is not None:
claude_str = f" └─ claude pid {repo.claude_pid}"
parts: list[str] = []
if repo.claude_uptime is not None:
claude_str += f" (running {_format_uptime(repo.claude_uptime)})"
parts.append(f"running {_format_uptime(repo.claude_uptime)}")
if repo.session_owner is not None:
parts.append(f"held by {repo.session_owner}")
if parts:
claude_str += f" ({', '.join(parts)})"
body.append(claude_str)

for w in repo.webhook_activities:
Expand Down
Loading
Loading