-
Notifications
You must be signed in to change notification settings - Fork 6
test: add Deepchecks regression harness #259
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
cabd410
f37b17d
8160680
c9b8c58
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -77,9 +77,13 @@ telemetry = [ | |
| "axiom-py>=0.10.0", # Axiom observability (optional — degrades gracefully) | ||
| ] | ||
| dev = [ | ||
| "deepchecks>=0.19.1", | ||
| "numpy<2", | ||
| "pytest>=7.0.0", | ||
| "pytest-asyncio>=0.21.0", | ||
| "ruff>=0.1.0", | ||
| "scikit-learn<1.6", | ||
|
Comment on lines
+81
to
+85
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
CI still runs Python 3.13 in `.github/workflows/ci.yml`, but the new `numpy<2` dev pin has no Python 3.13 wheels, so the dev-extra install will fail on that job. Useful? React with 👍 / 👎. |
||
| "setuptools>=70.0.0,<81", | ||
| ] | ||
| docs = [ | ||
| "mkdocs-material>=9.5.0", | ||
|
|
@@ -114,5 +118,6 @@ asyncio_mode = "auto" | |
| pythonpath = ["src"] | ||
| markers = [ | ||
| "integration: tests that need the real production DB (268K+ chunks)", | ||
| "live: tests that need the live production DB or mutable local hooks state", | ||
| "slow: tests that load ML models or take >30s", | ||
| ] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,7 @@ set -u -o pipefail | |
| ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" | ||
| TEST_ROOT="${BRAINLAYER_TEST_ROOT:-$ROOT_DIR/tests}" | ||
| BRAINLAYER_USE_UV="${BRAINLAYER_USE_UV:-1}" | ||
| UNIT_MARK_EXPR="${BRAINLAYER_PYTEST_MARK_EXPR:-not integration and not live}" | ||
| exit_status=0 | ||
|
|
||
| run_step() { | ||
|
|
@@ -33,21 +34,68 @@ collect_bun_tests() { | |
| find "$TEST_ROOT" -type f -name "*.test.ts" | sort | ||
| } | ||
|
|
||
| collect_isolated_pytest_files() { | ||
| if [ ! -d "$TEST_ROOT" ]; then | ||
| return 0 | ||
| fi | ||
|
|
||
| local candidate | ||
| for candidate in \ | ||
| "$TEST_ROOT/test_eval_framework.py" \ | ||
| "$TEST_ROOT/test_follow_up_rewrite.py" \ | ||
| "$TEST_ROOT/test_prompt_classification.py" | ||
| do | ||
| if [ -f "$candidate" ]; then | ||
| printf '%s\n' "$candidate" | ||
| fi | ||
| done | ||
| } | ||
|
|
||
| collect_regression_shell_tests() { | ||
| if [ ! -d "$TEST_ROOT" ]; then | ||
| return 0 | ||
| fi | ||
|
|
||
| find "$TEST_ROOT" -type f -path "*/regression/*.sh" | sort | ||
| } | ||
|
|
||
| run_pytest() { | ||
| if [ "$BRAINLAYER_USE_UV" = "1" ] && command -v uv >/dev/null 2>&1; then | ||
| uv run pytest "$@" | ||
| uv run --extra dev pytest "$@" | ||
| else | ||
| pytest "$@" | ||
| fi | ||
| } | ||
|
|
||
| cd "$ROOT_DIR" | ||
|
|
||
| run_step "pytest unit suite" run_pytest "$TEST_ROOT/" -v --tb=short -m "not integration" | ||
| isolated_pytest_files=() | ||
| while IFS= read -r test_file; do | ||
| isolated_pytest_files+=("$test_file") | ||
| done < <(collect_isolated_pytest_files) | ||
|
|
||
| pytest_unit_cmd=(run_pytest "$TEST_ROOT/" -v --tb=short -m "$UNIT_MARK_EXPR") | ||
| if [ "${#isolated_pytest_files[@]}" -gt 0 ]; then | ||
| for isolated_test in "${isolated_pytest_files[@]}"; do | ||
| pytest_unit_cmd+=("--ignore=$isolated_test") | ||
| done | ||
| fi | ||
|
|
||
| run_step "pytest unit suite" "${pytest_unit_cmd[@]}" | ||
| run_step \ | ||
| "pytest MCP tool registration" \ | ||
| run_pytest "$TEST_ROOT/test_think_recall_integration.py::TestMCPToolCount" -v --tb=short | ||
|
|
||
| if [ "${#isolated_pytest_files[@]}" -gt 0 ]; then | ||
| run_step \ | ||
| "pytest isolated eval and hook routing" \ | ||
| run_pytest "${isolated_pytest_files[@]}" -v --tb=short | ||
| else | ||
| echo "==> pytest isolated eval and hook routing" | ||
| echo "SKIP: no isolated pytest files found under $TEST_ROOT" | ||
| echo | ||
| fi | ||
|
|
||
| bun_tests=() | ||
| while IFS= read -r test_file; do | ||
| bun_tests+=("$test_file") | ||
|
|
@@ -67,6 +115,21 @@ else | |
| echo | ||
| fi | ||
|
|
||
| shell_tests=() | ||
| while IFS= read -r test_file; do | ||
| shell_tests+=("$test_file") | ||
| done < <(collect_regression_shell_tests) | ||
|
|
||
| if [ "${#shell_tests[@]}" -gt 0 ]; then | ||
| for shell_test in "${shell_tests[@]}"; do | ||
| run_step "regression shell $(basename "$shell_test")" bash "$shell_test" | ||
|
Comment on lines
+123
to
+125
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This loop runs every regression shell script unconditionally, but the newly added FTS5 determinism script depends on `uv`/`uvx` being installed (presumably not guaranteed in every environment — verify); consider guarding with a `command -v` check or a skip message. Useful? React with 👍 / 👎. |
||
| done | ||
| else | ||
| echo "==> regression shell suite" | ||
| echo "SKIP: no regression shell scripts found under $TEST_ROOT" | ||
| echo | ||
| fi | ||
|
|
||
| if [ "$exit_status" -ne 0 ]; then | ||
| echo "BrainLayer test gate failed." | ||
| else | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| """Regression harness tests for ecosystem-level fixtures.""" |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,112 @@ | ||
| """Helpers for the stale index regression fixture.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import json | ||
| import sqlite3 | ||
| from pathlib import Path | ||
| from tempfile import NamedTemporaryFile | ||
| from typing import Any | ||
|
|
||
| from brainlayer.embeddings import get_embedding_model | ||
|
|
||
| FIXTURE_PATH = Path(__file__).resolve().parent.parent / "fixtures" / "stale_index_query.json" | ||
|
|
||
|
|
||
| def load_fixture() -> dict[str, Any]: | ||
| """Load the seeded stale-index regression fixture.""" | ||
| return json.loads(FIXTURE_PATH.read_text()) | ||
|
|
||
|
|
||
| def cosine_similarity(left: list[float], right: list[float]) -> float: | ||
| """Compute cosine similarity without adding another numeric dependency.""" | ||
| if len(left) != len(right): | ||
| raise ValueError(f"embedding length mismatch: {len(left)} != {len(right)}") | ||
|
|
||
| dot_product = 0.0 | ||
| left_norm = 0.0 | ||
| right_norm = 0.0 | ||
| for left_value, right_value in zip(left, right): | ||
| dot_product += left_value * right_value | ||
| left_norm += left_value * left_value | ||
| right_norm += right_value * right_value | ||
|
|
||
| return dot_product / ((left_norm**0.5) * (right_norm**0.5)) | ||
|
|
||
|
|
||
| def create_fixture_db(db_path: Path) -> None: | ||
| """Seed a temporary SQLite FTS table from the fixture chunks.""" | ||
| fixture = load_fixture() | ||
| connection = sqlite3.connect(db_path) | ||
| try: | ||
| connection.execute( | ||
| """ | ||
| CREATE VIRTUAL TABLE chunks_fts USING fts5( | ||
| content, | ||
| summary, | ||
| tags, | ||
| resolved_query, | ||
| key_facts, | ||
| resolved_queries, | ||
| chunk_id UNINDEXED | ||
| ); | ||
| """ | ||
| ) | ||
| insert_sql = """ | ||
| INSERT INTO chunks_fts( | ||
| content, summary, tags, resolved_query, key_facts, resolved_queries, chunk_id | ||
| ) VALUES (?, ?, ?, ?, ?, ?, ?) | ||
| """ | ||
| for chunk in fixture["chunks"]: | ||
| connection.execute( | ||
| insert_sql, | ||
| ( | ||
| chunk["content"], | ||
| chunk.get("summary"), | ||
| json.dumps(chunk["tags"]) if chunk.get("tags") else None, | ||
| chunk.get("resolved_query"), | ||
| json.dumps(chunk["key_facts"]) if chunk.get("key_facts") else None, | ||
| json.dumps(chunk["resolved_queries"]) if chunk.get("resolved_queries") else None, | ||
| chunk["id"], | ||
| ), | ||
| ) | ||
| connection.commit() | ||
| finally: | ||
| connection.close() | ||
|
|
||
|
|
||
| def current_embedding_rows() -> list[list[float]]: | ||
| """Re-embed the fixture corpus with the current model.""" | ||
| fixture = load_fixture() | ||
| model = get_embedding_model() | ||
| encoder = model._load_model() | ||
| chunk_embeddings = encoder.encode( | ||
| [chunk["content"] for chunk in fixture["chunks"]], | ||
| convert_to_numpy=True, | ||
| show_progress_bar=False, | ||
| ).tolist() | ||
| sample_embedding = model.embed_query(fixture["sample_text"]["text"]) | ||
| return [[float(value) for value in row] for row in chunk_embeddings] + [sample_embedding] | ||
|
|
||
|
|
||
| def baseline_embedding_rows() -> list[list[float]]: | ||
| """Return the baseline embedding matrix stored in the fixture.""" | ||
| fixture = load_fixture() | ||
| chunk_embeddings = [[float(value) for value in chunk["embedding"]] for chunk in fixture["chunks"]] | ||
| sample_embedding = [float(value) for value in fixture["sample_text"]["baseline_embedding"]] | ||
| return chunk_embeddings + [sample_embedding] | ||
|
|
||
|
|
||
| def write_expected_ranking_json(output_path: Path) -> None: | ||
| """Write the baseline FTS rows to a normalized JSON file.""" | ||
| fixture = load_fixture() | ||
| output_path.write_text(json.dumps(fixture["query"]["baseline_rows"], indent=2, sort_keys=True) + "\n") | ||
|
|
||
|
|
||
| def create_temp_fixture_db() -> Path: | ||
| """Create a temporary seeded fixture DB and return its path.""" | ||
| temp_file = NamedTemporaryFile(prefix="brainlayer-stale-index-", suffix=".db", delete=False) | ||
| temp_path = Path(temp_file.name) | ||
| temp_file.close() | ||
| create_fixture_db(temp_path) | ||
| return temp_path | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unused helper exported from regression fixture — Low Severity. The new `create_temp_fixture_db` helper is defined but not referenced by any test in this change. Reviewed by Cursor Bugbot for commit 8160680. Configure here. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| """Deepchecks regression for stale-index embedding drift.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import pandas as pd | ||
| from deepchecks.tabular import Dataset | ||
| from deepchecks.tabular.checks import FeatureDrift | ||
|
|
||
| from tests.regression._stale_index_fixture import ( | ||
| baseline_embedding_rows, | ||
| cosine_similarity, | ||
| current_embedding_rows, | ||
| load_fixture, | ||
| ) | ||
|
|
||
|
|
||
| def _embedding_frame(rows: list[list[float]]) -> pd.DataFrame: | ||
| if not rows: | ||
| raise ValueError("embedding fixture rows must be non-empty") | ||
| width = len(rows[0]) | ||
| columns = [f"dim_{index:04d}" for index in range(width)] | ||
| return pd.DataFrame(rows, columns=columns) | ||
|
|
||
|
|
||
| def test_fixture_embeddings_pass_deepchecks_and_cosine_threshold() -> None: | ||
| fixture = load_fixture() | ||
| baseline_rows = baseline_embedding_rows() | ||
| current_rows = current_embedding_rows() | ||
| min_cosine_similarity = fixture["sample_text"]["min_cosine_similarity"] | ||
|
|
||
| assert len(baseline_rows) == len(current_rows) | ||
| for baseline_row, current_row in zip(baseline_rows, current_rows): | ||
| assert cosine_similarity(current_row, baseline_row) > min_cosine_similarity | ||
|
|
||
| baseline_frame = _embedding_frame(baseline_rows) | ||
| current_frame = _embedding_frame(current_rows) | ||
| drift_check = FeatureDrift(min_samples=len(baseline_rows)) | ||
| # With five rows, Deepchecks' KS-based numeric drift bottoms out around 0.2 | ||
| # even when the distributions are effectively unchanged across platforms. | ||
| drift_check.add_condition_drift_score_less_than(max_allowed_numeric_score=0.21) | ||
| result = drift_check.run( | ||
| train_dataset=Dataset(baseline_frame, cat_features=[]), | ||
| test_dataset=Dataset(current_frame, cat_features=[]), | ||
| with_display=False, | ||
| ) | ||
|
|
||
| assert result.passed_conditions() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Drift test loads heavy model in CI default laneMedium Severity
Additional Locations (1). Reviewed by Cursor Bugbot for commit c9b8c58. Configure here. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. KS drift threshold near floor risks flaky failures — Medium Severity. With only 5 rows per dataset, the KS statistic for any feature can take values 0, 0.2, 0.4, 0.6, 0.8, or 1.0. Setting `max_allowed_numeric_score=0.21` leaves essentially no margin above the 0.2 floor, so any small platform or model variation that pushes one feature to 0.4 will fail the check. Reviewed by Cursor Bugbot for commit c9b8c58. Configure here. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,50 @@ | ||
| #!/usr/bin/env bash | ||
|
|
||
| set -euo pipefail | ||
|
|
||
| ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" | ||
| TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/brainlayer-fts5-determinism.XXXXXX")" | ||
| DB_PATH="$TMP_DIR/stale-index.db" | ||
| EXPECTED_PATH="$TMP_DIR/expected.json" | ||
| ACTUAL_RAW_PATH="$TMP_DIR/actual.raw.json" | ||
| ACTUAL_PATH="$TMP_DIR/actual.json" | ||
|
|
||
| cleanup() { | ||
| rm -rf "$TMP_DIR" | ||
| } | ||
| trap cleanup EXIT | ||
|
|
||
| cd "$ROOT_DIR" | ||
|
|
||
| uv run python3 - <<'PY' "$DB_PATH" "$EXPECTED_PATH" | ||
| from pathlib import Path | ||
| import sys | ||
|
|
||
| from tests.regression._stale_index_fixture import create_fixture_db, write_expected_ranking_json | ||
|
|
||
| db_path = Path(sys.argv[1]) | ||
| expected_path = Path(sys.argv[2]) | ||
| create_fixture_db(db_path) | ||
| write_expected_ranking_json(expected_path) | ||
| PY | ||
|
|
||
| QUERY_SQL="$(uv run python3 - <<'PY' | ||
| from tests.regression._stale_index_fixture import load_fixture | ||
|
|
||
| print(load_fixture()["sqlite_snapshot"]["query_sql"]) | ||
| PY | ||
| )" | ||
|
|
||
| uvx --from sqlite-utils sqlite-utils query "$DB_PATH" "$QUERY_SQL" > "$ACTUAL_RAW_PATH" | ||
|
|
||
| uv run python3 - <<'PY' "$ACTUAL_RAW_PATH" "$ACTUAL_PATH" | ||
| import json | ||
| from pathlib import Path | ||
| import sys | ||
|
|
||
| raw_path = Path(sys.argv[1]) | ||
| output_path = Path(sys.argv[2]) | ||
| output_path.write_text(json.dumps(json.loads(raw_path.read_text()), indent=2, sort_keys=True) + "\n") | ||
| PY | ||
|
|
||
| diff -u "$EXPECTED_PATH" "$ACTUAL_PATH" |


There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
numpy<2 pin breaks Python 3.13 install
High Severity
The new dev extra constraint
`numpy<2` is incompatible with Python 3.13, which is included in the CI matrix. NumPy 1.x (latest 1.26.x) only ships wheels for Python 3.9–3.12 and refuses to build from source on 3.13, so `pip install -e ".[dev]"` will fail on the 3.13 job before any tests run. The runtime dependency `numpy>=1.22,<3.0` previously allowed NumPy 2.x, which does support 3.13. Additional Locations (1)
.github/workflows/ci.yml#L13-L14Reviewed by Cursor Bugbot for commit 8160680. Configure here.