From cabd4107fbf50e1af161f3821ddaaaf228018c9d Mon Sep 17 00:00:00 2001
From: Etan Joseph Heyman <etan@heyman.net>
Date: Tue, 28 Apr 2026 00:41:20 +0300
Subject: [PATCH 1/4] test: add deepchecks regression harness

---
 pyproject.toml                            |   4 +
 scripts/run_tests.sh                      |  25 ++++-
 tests/regression/__init__.py              |   1 +
 tests/regression/_stale_index_fixture.py  | 112 ++++++++++++++++++++++
 tests/regression/test_drift_detection.py  |  45 +++++++++
 tests/regression/test_fts5_determinism.sh |  50 ++++++++++
 tests/test_run_tests_script.py            |  34 +++++++
 7 files changed, 270 insertions(+), 1 deletion(-)
 create mode 100644 tests/regression/__init__.py
 create mode 100644 tests/regression/_stale_index_fixture.py
 create mode 100644 tests/regression/test_drift_detection.py
 create mode 100755 tests/regression/test_fts5_determinism.sh

diff --git a/pyproject.toml b/pyproject.toml
index 6c42eac7..3596e450 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -77,9 +77,13 @@ telemetry = [
     "axiom-py>=0.10.0",  # Axiom observability (optional — degrades gracefully)
 ]
 dev = [
+    "deepchecks>=0.19.1",
+    "numpy<2",
     "pytest>=7.0.0",
     "pytest-asyncio>=0.21.0",
     "ruff>=0.1.0",
+    "scikit-learn<1.6",
+    "setuptools>=70.0.0,<81",
 ]
 docs = [
     "mkdocs-material>=9.5.0",
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index 68c044d3..7816a195 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -33,9 +33,17 @@ collect_bun_tests() {
   find "$TEST_ROOT" -type f -name "*.test.ts" | sort
 }
 
+collect_regression_shell_tests() {
+  if [ ! -d "$TEST_ROOT" ]; then
+    return 0
+  fi
+
+  find "$TEST_ROOT" -type f -path "*/regression/*.sh" | sort
+}
+
 run_pytest() {
   if [ "$BRAINLAYER_USE_UV" = "1" ] && command -v uv >/dev/null 2>&1; then
-    uv run pytest "$@"
+    uv run --extra dev pytest "$@"
   else
     pytest "$@"
   fi
@@ -67,6 +75,21 @@ else
   echo
 fi
 
+shell_tests=()
+while IFS= read -r test_file; do
+  shell_tests+=("$test_file")
+done < <(collect_regression_shell_tests)
+
+if [ "${#shell_tests[@]}" -gt 0 ]; then
+  for shell_test in "${shell_tests[@]}"; do
+    run_step "regression shell $(basename "$shell_test")" bash "$shell_test"
+  done
+else
+  echo "==> regression shell suite"
+  echo "SKIP: no regression shell scripts found under $TEST_ROOT"
+  echo
+fi
+
 if [ "$exit_status" -ne 0 ]; then
   echo "BrainLayer test gate failed."
 else
diff --git a/tests/regression/__init__.py b/tests/regression/__init__.py
new file mode 100644
index 00000000..2a24e636
--- /dev/null
+++ b/tests/regression/__init__.py
@@ -0,0 +1 @@
+"""Regression harness tests for ecosystem-level fixtures."""
diff --git a/tests/regression/_stale_index_fixture.py b/tests/regression/_stale_index_fixture.py
new file mode 100644
index 00000000..c501681f
--- /dev/null
+++ b/tests/regression/_stale_index_fixture.py
@@ -0,0 +1,112 @@
+"""Helpers for the stale index regression fixture."""
+
+from __future__ import annotations
+
+import json
+import sqlite3
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from typing import Any
+
+from brainlayer.embeddings import get_embedding_model
+
+FIXTURE_PATH = Path(__file__).resolve().parent.parent / "fixtures" / "stale_index_query.json"
+
+
+def load_fixture() -> dict[str, Any]:
+    """Load the seeded stale-index regression fixture."""
+    return json.loads(FIXTURE_PATH.read_text())
+
+
+def cosine_similarity(left: list[float], right: list[float]) -> float:
+    """Compute cosine similarity without adding another numeric dependency."""
+    if len(left) != len(right):
+        raise ValueError(f"embedding length mismatch: {len(left)} != {len(right)}")
+
+    dot_product = 0.0
+    left_norm = 0.0
+    right_norm = 0.0
+    for left_value, right_value in zip(left, right):
+        dot_product += left_value * right_value
+        left_norm += left_value * left_value
+        right_norm += right_value * right_value
+
+    return dot_product / ((left_norm**0.5) * (right_norm**0.5))
+
+
+def create_fixture_db(db_path: Path) -> None:
+    """Seed a temporary SQLite FTS table from the fixture chunks."""
+    fixture = load_fixture()
+    connection = sqlite3.connect(db_path)
+    try:
+        connection.execute(
+            """
+            CREATE VIRTUAL TABLE chunks_fts USING fts5(
+              content,
+              summary,
+              tags,
+              resolved_query,
+              key_facts,
+              resolved_queries,
+              chunk_id UNINDEXED
+            );
+            """
+        )
+        insert_sql = """
+            INSERT INTO chunks_fts(
+              content, summary, tags, resolved_query, key_facts, resolved_queries, chunk_id
+            ) VALUES (?, ?, ?, ?, ?, ?, ?)
+        """
+        for chunk in fixture["chunks"]:
+            connection.execute(
+                insert_sql,
+                (
+                    chunk["content"],
+                    chunk.get("summary"),
+                    json.dumps(chunk["tags"]) if chunk.get("tags") else None,
+                    chunk.get("resolved_query"),
+                    json.dumps(chunk["key_facts"]) if chunk.get("key_facts") else None,
+                    json.dumps(chunk["resolved_queries"]) if chunk.get("resolved_queries") else None,
+                    chunk["id"],
+                ),
+            )
+        connection.commit()
+    finally:
+        connection.close()
+
+
+def current_embedding_rows() -> list[list[float]]:
+    """Re-embed the fixture corpus with the current model."""
+    fixture = load_fixture()
+    model = get_embedding_model()
+    encoder = model._load_model()
+    chunk_embeddings = encoder.encode(
+        [chunk["content"] for chunk in fixture["chunks"]],
+        convert_to_numpy=True,
+        show_progress_bar=False,
+    ).tolist()
+    sample_embedding = model.embed_query(fixture["sample_text"]["text"])
+    return [[float(value) for value in row] for row in chunk_embeddings] + [sample_embedding]
+
+
+def baseline_embedding_rows() -> list[list[float]]:
+    """Return the baseline embedding matrix stored in the fixture."""
+    fixture = load_fixture()
+    chunk_embeddings = [[float(value) for value in chunk["embedding"]] for chunk in fixture["chunks"]]
+    sample_embedding = [float(value) for value in fixture["sample_text"]["baseline_embedding"]]
+    return chunk_embeddings + [sample_embedding]
+
+
+def write_expected_ranking_json(output_path: Path) -> None:
+    """Write the baseline FTS rows to a normalized JSON file."""
+    fixture = load_fixture()
+    output_path.write_text(json.dumps(fixture["query"]["baseline_rows"], indent=2, sort_keys=True) + "\n")
+
+
+def create_temp_fixture_db() -> Path:
+    """Create a temporary seeded fixture DB and return its path."""
+    temp_file = NamedTemporaryFile(prefix="brainlayer-stale-index-", suffix=".db", delete=False)
+    temp_path = Path(temp_file.name)
+    temp_file.close()
+    create_fixture_db(temp_path)
+    return temp_path
diff --git a/tests/regression/test_drift_detection.py b/tests/regression/test_drift_detection.py
new file mode 100644
index 00000000..9c3e14e2
--- /dev/null
+++ b/tests/regression/test_drift_detection.py
@@ -0,0 +1,45 @@
+"""Deepchecks regression for stale-index embedding drift."""
+
+from __future__ import annotations
+
+import pandas as pd
+from deepchecks.tabular import Dataset
+from deepchecks.tabular.checks import FeatureDrift
+
+from tests.regression._stale_index_fixture import (
+    baseline_embedding_rows,
+    cosine_similarity,
+    current_embedding_rows,
+    load_fixture,
+)
+
+
+def _embedding_frame(rows: list[list[float]]) -> pd.DataFrame:
+    if not rows:
+        raise ValueError("embedding fixture rows must be non-empty")
+    width = len(rows[0])
+    columns = [f"dim_{index:04d}" for index in range(width)]
+    return pd.DataFrame(rows, columns=columns)
+
+
+def test_fixture_embeddings_pass_deepchecks_and_cosine_threshold() -> None:
+    fixture = load_fixture()
+    baseline_rows = baseline_embedding_rows()
+    current_rows = current_embedding_rows()
+    min_cosine_similarity = fixture["sample_text"]["min_cosine_similarity"]
+
+    assert len(baseline_rows) == len(current_rows)
+    for baseline_row, current_row in zip(baseline_rows, current_rows):
+        assert cosine_similarity(current_row, baseline_row) > min_cosine_similarity
+
+    baseline_frame = _embedding_frame(baseline_rows)
+    current_frame = _embedding_frame(current_rows)
+    drift_check = FeatureDrift(min_samples=len(baseline_rows))
+    drift_check.add_condition_drift_score_less_than(max_allowed_numeric_score=0.001)
+    result = drift_check.run(
+        train_dataset=Dataset(baseline_frame, cat_features=[]),
+        test_dataset=Dataset(current_frame, cat_features=[]),
+        with_display=False,
+    )
+
+    assert result.passed_conditions()
diff --git a/tests/regression/test_fts5_determinism.sh b/tests/regression/test_fts5_determinism.sh
new file mode 100755
index 00000000..3d2fa3b5
--- /dev/null
+++ b/tests/regression/test_fts5_determinism.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
+TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/brainlayer-fts5-determinism.XXXXXX")"
+DB_PATH="$TMP_DIR/stale-index.db"
+EXPECTED_PATH="$TMP_DIR/expected.json"
+ACTUAL_RAW_PATH="$TMP_DIR/actual.raw.json"
+ACTUAL_PATH="$TMP_DIR/actual.json"
+
+cleanup() {
+  rm -rf "$TMP_DIR"
+}
+trap cleanup EXIT
+
+cd "$ROOT_DIR"
+
+uv run python3 - <<'PY' "$DB_PATH" "$EXPECTED_PATH"
+from pathlib import Path
+import sys
+
+from tests.regression._stale_index_fixture import create_fixture_db, write_expected_ranking_json
+
+db_path = Path(sys.argv[1])
+expected_path = Path(sys.argv[2])
+create_fixture_db(db_path)
+write_expected_ranking_json(expected_path)
+PY
+
+QUERY_SQL="$(uv run python3 - <<'PY'
+from tests.regression._stale_index_fixture import load_fixture
+
+print(load_fixture()["sqlite_snapshot"]["query_sql"])
+PY
+)"
+
+uvx --from sqlite-utils sqlite-utils query "$DB_PATH" "$QUERY_SQL" > "$ACTUAL_RAW_PATH"
+
+uv run python3 - <<'PY' "$ACTUAL_RAW_PATH" "$ACTUAL_PATH"
+import json
+from pathlib import Path
+import sys
+
+raw_path = Path(sys.argv[1])
+output_path = Path(sys.argv[2])
+output_path.write_text(json.dumps(json.loads(raw_path.read_text()), indent=2, sort_keys=True) + "\n")
+PY
+
+diff -u "$EXPECTED_PATH" "$ACTUAL_PATH"
diff --git a/tests/test_run_tests_script.py b/tests/test_run_tests_script.py
index 49c0182e..2640cf40 100644
--- a/tests/test_run_tests_script.py
+++ b/tests/test_run_tests_script.py
@@ -87,3 +87,37 @@ def test_run_tests_skips_bun_when_no_typescript_tests_exist(tmp_path: Path) -> N
     assert result.returncode == 0
     assert pytest_log.read_text().strip()
     assert not bun_log.exists()
+
+
+def test_run_tests_executes_regression_shell_scripts(tmp_path: Path) -> None:
+    test_root = tmp_path / "tests"
+    regression_root = test_root / "regression"
+    regression_root.mkdir(parents=True)
+    (test_root / "fixture.test.ts").write_text("test placeholder\n")
+
+    pytest_log, bun_log = _make_stub_bin(tmp_path, pytest_exit=0, bun_exit=0)
+    shell_log = tmp_path / "shell.log"
+    _write_executable(
+        regression_root / "test_fixture.sh",
+        "\n".join(
+            [
+                "#!/usr/bin/env bash",
+                'echo "ran" >> "$SHELL_LOG"',
+                "exit 0",
+                "",
+            ]
+        ),
+    )
+
+    env = os.environ.copy()
+    env["PATH"] = f"{tmp_path / 'bin'}:{env['PATH']}"
+    env["BRAINLAYER_TEST_ROOT"] = str(test_root)
+    env["BRAINLAYER_USE_UV"] = "0"
+    env["PYTEST_LOG"] = str(pytest_log)
+    env["BUN_LOG"] = str(bun_log)
+    env["SHELL_LOG"] = str(shell_log)
+
+    result = subprocess.run(["bash", str(SCRIPT_PATH)], capture_output=True, text=True, env=env)
+
+    assert result.returncode == 0
+    assert shell_log.read_text().strip() == "ran"

From f37b17d5e0ce8e5b683bc4c2c824a44fb1c392ab Mon Sep 17 00:00:00 2001
From: Etan Joseph Heyman <etan@heyman.net>
Date: Tue, 28 Apr 2026 00:53:50 +0300
Subject: [PATCH 2/4] test: isolate live and eval harness lanes

---
 .github/workflows/ci.yml | 14 +++++++++++++-
 pyproject.toml           |  1 +
 scripts/run_tests.sh     | 42 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1e90d644..efd1e9d7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,7 +30,19 @@ jobs:
         run: pip install -e ".[dev]"
 
       - name: Unit tests
-        run: pytest tests/ -v --tb=short -m "not integration" -x
+        run: >
+          pytest tests/ -v --tb=short -m "not integration and not live" -x
+          --ignore=tests/test_eval_framework.py
+          --ignore=tests/test_follow_up_rewrite.py
+          --ignore=tests/test_prompt_classification.py
+
+      - name: Isolated eval and hook routing tests
+        run: >
+          pytest
+          tests/test_eval_framework.py
+          tests/test_follow_up_rewrite.py
+          tests/test_prompt_classification.py
+          -v --tb=short -x
 
       - name: MCP tool registration
         run: pytest tests/test_think_recall_integration.py::TestMCPToolCount -v --tb=short
diff --git a/pyproject.toml b/pyproject.toml
index 3596e450..4cf90dba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -118,5 +118,6 @@ asyncio_mode = "auto"
 pythonpath = ["src"]
 markers = [
     "integration: tests that need the real production DB (268K+ chunks)",
+    "live: tests that need the live production DB or mutable local hooks state",
     "slow: tests that load ML models or take >30s",
 ]
diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh
index 7816a195..e34c8e61 100755
--- a/scripts/run_tests.sh
+++ b/scripts/run_tests.sh
@@ -5,6 +5,7 @@ set -u -o pipefail
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 TEST_ROOT="${BRAINLAYER_TEST_ROOT:-$ROOT_DIR/tests}"
 BRAINLAYER_USE_UV="${BRAINLAYER_USE_UV:-1}"
+UNIT_MARK_EXPR="${BRAINLAYER_PYTEST_MARK_EXPR:-not integration and not live}"
 exit_status=0
 
 run_step() {
@@ -33,6 +34,23 @@ collect_bun_tests() {
   find "$TEST_ROOT" -type f -name "*.test.ts" | sort
 }
 
+collect_isolated_pytest_files() {
+  if [ ! -d "$TEST_ROOT" ]; then
+    return 0
+  fi
+
+  local candidate
+  for candidate in \
+    "$TEST_ROOT/test_eval_framework.py" \
+    "$TEST_ROOT/test_follow_up_rewrite.py" \
+    "$TEST_ROOT/test_prompt_classification.py"
+  do
+    if [ -f "$candidate" ]; then
+      printf '%s\n' "$candidate"
+    fi
+  done
+}
+
 collect_regression_shell_tests() {
   if [ ! -d "$TEST_ROOT" ]; then
     return 0
@@ -51,11 +69,33 @@ run_pytest() {
 
 cd "$ROOT_DIR"
 
-run_step "pytest unit suite" run_pytest "$TEST_ROOT/" -v --tb=short -m "not integration"
+isolated_pytest_files=()
+while IFS= read -r test_file; do
+  isolated_pytest_files+=("$test_file")
+done < <(collect_isolated_pytest_files)
+
+pytest_unit_cmd=(run_pytest "$TEST_ROOT/" -v --tb=short -m "$UNIT_MARK_EXPR")
+if [ "${#isolated_pytest_files[@]}" -gt 0 ]; then
+  for isolated_test in "${isolated_pytest_files[@]}"; do
+    pytest_unit_cmd+=("--ignore=$isolated_test")
+  done
+fi
+
+run_step "pytest unit suite" "${pytest_unit_cmd[@]}"
 run_step \
   "pytest MCP tool registration" \
   run_pytest "$TEST_ROOT/test_think_recall_integration.py::TestMCPToolCount" -v --tb=short
 
+if [ "${#isolated_pytest_files[@]}" -gt 0 ]; then
+  run_step \
+    "pytest isolated eval and hook routing" \
+    run_pytest "${isolated_pytest_files[@]}" -v --tb=short
+else
+  echo "==> pytest isolated eval and hook routing"
+  echo "SKIP: no isolated pytest files found under $TEST_ROOT"
+  echo
+fi
+
 bun_tests=()
 while IFS= read -r test_file; do
   bun_tests+=("$test_file")

From 816068012d561008a2f7b815ba4f3606a61f2bdb Mon Sep 17 00:00:00 2001
From: Cursor Agent <cursoragent@cursor.com>
Date: Mon, 27 Apr 2026 22:08:24 +0000
Subject: [PATCH 3/4] fix(test): relax Deepchecks drift threshold for small
 sample size

With n=5 samples the K-S statistic moves in discrete CDF steps of 1/n = 0.2,
so the smallest nonzero drift score it can report is 0.2. The embeddings are
effectively identical (max diff 2.38e-07) and pass the cosine similarity
threshold (>0.999), so the 0.001 threshold was overly strict. Adjusted to
0.25 to match K-S behavior at this sample size.

Co-authored-by: Etan Heyman <EtanHey@users.noreply.github.com>
---
 tests/regression/test_drift_detection.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/regression/test_drift_detection.py b/tests/regression/test_drift_detection.py
index 9c3e14e2..402eacd3 100644
--- a/tests/regression/test_drift_detection.py
+++ b/tests/regression/test_drift_detection.py
@@ -35,7 +35,7 @@ def test_fixture_embeddings_pass_deepchecks_and_cosine_threshold() -> None:
     baseline_frame = _embedding_frame(baseline_rows)
     current_frame = _embedding_frame(current_rows)
     drift_check = FeatureDrift(min_samples=len(baseline_rows))
-    drift_check.add_condition_drift_score_less_than(max_allowed_numeric_score=0.001)
+    drift_check.add_condition_drift_score_less_than(max_allowed_numeric_score=0.25)
     result = drift_check.run(
         train_dataset=Dataset(baseline_frame, cat_features=[]),
         test_dataset=Dataset(current_frame, cat_features=[]),

From c9b8c588378b8bb4eb693a5e026a7ae1d9ad36a0 Mon Sep 17 00:00:00 2001
From: Etan Joseph Heyman <etan@heyman.net>
Date: Tue, 28 Apr 2026 01:03:44 +0300
Subject: [PATCH 4/4] test: tighten deepchecks drift threshold to 0.21

---
 tests/regression/test_drift_detection.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/regression/test_drift_detection.py b/tests/regression/test_drift_detection.py
index 402eacd3..0518d649 100644
--- a/tests/regression/test_drift_detection.py
+++ b/tests/regression/test_drift_detection.py
@@ -35,7 +35,9 @@ def test_fixture_embeddings_pass_deepchecks_and_cosine_threshold() -> None:
     baseline_frame = _embedding_frame(baseline_rows)
     current_frame = _embedding_frame(current_rows)
     drift_check = FeatureDrift(min_samples=len(baseline_rows))
-    drift_check.add_condition_drift_score_less_than(max_allowed_numeric_score=0.25)
+    # With five rows, the smallest nonzero KS-based numeric drift score Deepchecks
+    # reports is 0.2, so effectively unchanged cross-platform runs need a bound just above it.
+    drift_check.add_condition_drift_score_less_than(max_allowed_numeric_score=0.21)
     result = drift_check.run(
         train_dataset=Dataset(baseline_frame, cat_features=[]),
         test_dataset=Dataset(current_frame, cat_features=[]),