-
Notifications
You must be signed in to change notification settings - Fork 7
feat: promote concept tags into KG entities #199
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
| #!/usr/bin/env python3 | ||
| """Promote high-frequency concept tags into KG entities.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import argparse | ||
| import json | ||
| import sys | ||
| from pathlib import Path | ||
|
|
||
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src")) | ||
|
|
||
| from brainlayer.paths import get_db_path | ||
| from brainlayer.pipeline.tag_entity_promotion import promote_tag_entities | ||
| from brainlayer.vector_store import VectorStore | ||
|
|
||
|
|
||
| def main() -> int: | ||
| parser = argparse.ArgumentParser(description="Promote high-frequency chunk tags into KG entities") | ||
| parser.add_argument("--min-count", type=int, default=500, help="Minimum tagged chunk count to promote") | ||
| parser.add_argument("--limit", type=int, default=None, help="Optional candidate limit") | ||
| parser.add_argument("--dry-run", action="store_true", help="Show candidates without writing") | ||
| args = parser.parse_args() | ||
|
|
||
| store = None | ||
| try: | ||
| store = VectorStore(get_db_path()) | ||
| stats = promote_tag_entities( | ||
| store, | ||
| min_count=args.min_count, | ||
| limit=args.limit, | ||
| dry_run=args.dry_run, | ||
| ) | ||
| print(json.dumps(stats, indent=2, sort_keys=True)) | ||
| return 0 | ||
| finally: | ||
| if store is not None: | ||
| store.close() | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| raise SystemExit(main()) | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,225 @@ | ||
| """Promote high-frequency concept tags into KG entities.""" | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import re | ||
| from typing import Any | ||
|
|
||
| from ..vector_store import VectorStore | ||
|
|
||
# Prefixes and names for activity/meta-style tags describing *how* a chunk was
# produced rather than *what* it is about; these are never promoted.
# NOTE(review): ACTIVITY_TAG_PREFIXES duplicates the LIKE patterns hard-coded
# in find_promotion_candidates' SQL — keep the two in sync.
ACTIVITY_TAG_PREFIXES = ("act:", "dom:", "meta/")
ACTIVITY_TAGS = {
    "debugging",
    "testing",
    "refactoring",
    "code-review",
    "bug-fix",
    "feature-dev",
    "configuration",
    "documentation",
    "project-management",
    "error-handling",
    "task-management",
    "deployment",
    "workflow",
    "automation",
    "verification",
    "command-line",
    "planning",
    "tooling",
    "file-system",
    "file-management",
    "scripting",
    "monitoring",
    "assistant-action",
    "status-update",
    "version-control",
    "implementation",
    "collaboration",
    "discussion",
    "code-analysis",
    "metadata",
    "architecture",
    "styling",
    "confirmation",
    "troubleshooting",
    "design",
    "frontend",
    "backend",
    "command",
    "shell",
    "bash",
    "grep",
    "json",
    "regex",
    "html",
    "css",
    "svg",
}

# Known people whose tags should become "person" entities.
PERSON_TAGS = {
    "andrew-huberman",
    "avi-simon",
    "daniel-munk",
    "dor-zohar",
    "etan-heyman",
    "joshua-anderson",
    "maor-noah",
    "shachar-gerby",
    "theo-browne",
    "yuval-nir",
}

# Known tools/platforms whose tags should become "technology" entities.
TECHNOLOGY_TAGS = {
    "1password",
    "convex",
    "docker",
    "javascript",
    "nextjs",
    "openai",
    "postgres",
    "python",
    "railway",
    "react",
    "sqlite",
    "supabase",
    "telegram",
    "typescript",
    "whatsapp",
}

# Subject-matter tags. Currently informational only: "topic" is also the
# fallback type in classify_tag_entity_type, so membership here does not
# change the outcome.
TOPIC_TAGS = {
    "cold-exposure",
    "dopamine",
    "exercise",
    "fitness",
    "metabolism",
    "neuroscience",
    "nutrition",
    "psychology",
    "supplements",
    "wellness",
}

# Substring markers. HEBREW_MARKERS is informational for the same reason as
# TOPIC_TAGS; COMMUNITY_MARKERS actively routes tags to the "community" type.
HEBREW_MARKERS = {"hebrew", "rtl", "right-to-left"}
COMMUNITY_MARKERS = {"community", "collective", "crew", "forum", "group", "guild", "network"}


def _slugify_tag(tag: str) -> str:
    """Lowercase *tag* and collapse non-alphanumeric runs into single hyphens."""
    normalized = re.sub(r"[^a-z0-9]+", "-", tag.lower()).strip("-")
    return re.sub(r"-{2,}", "-", normalized)


def classify_tag_entity_type(tag: str) -> str:
    """Infer an entity type from a promoted tag.

    Args:
        tag: Raw tag text; normalized via _slugify_tag before matching, so
            case and surrounding punctuation do not affect classification.

    Returns:
        One of "person", "technology", "community", or "topic" (the default).
    """
    normalized = _slugify_tag(tag)

    if normalized in PERSON_TAGS:
        return "person"
    if normalized in TECHNOLOGY_TAGS:
        return "technology"
    if any(marker in normalized for marker in COMMUNITY_MARKERS):
        return "community"
    # Everything else — including TOPIC_TAGS and Hebrew/RTL markers — falls
    # through to "topic"; the previous explicit TOPIC_TAGS/HEBREW_MARKERS
    # check was redundant with this fallback and has been removed.
    return "topic"
|
|
||
|
|
||
def find_promotion_candidates(store: VectorStore, min_count: int = 500, limit: int | None = None) -> list[dict[str, Any]]:
    """Find high-frequency concept tags worth promoting to entities.

    Tags are normalized (lower-cased, trimmed) in SQL before grouping so that
    case or whitespace variants of the same tag are counted together, and so
    the existing-entity check does not miss near-duplicate names.

    Args:
        store: Open VectorStore; only read from.
        min_count: Minimum tagged chunk count for a tag to qualify.
        limit: Optional cap on the number of candidates returned.

    Returns:
        List of dicts with keys "tag" (normalized form), "count", and
        "entity_type", ordered by count descending then tag ascending.
    """
    cursor = store._read_cursor()
    placeholders = ", ".join("?" for _ in ACTIVITY_TAGS)
    # The three NOT LIKE patterns mirror ACTIVITY_TAG_PREFIXES; keep in sync.
    query = f"""
        SELECT lower(trim(ct.tag)) AS normalized_tag, COUNT(*) as cnt
        FROM chunk_tags ct
        LEFT JOIN kg_entities e ON lower(e.name) = lower(trim(ct.tag))
        WHERE e.id IS NULL
          AND ct.tag IS NOT NULL
          AND ct.tag != ''
          AND ct.tag NOT LIKE 'act:%'
          AND ct.tag NOT LIKE 'dom:%'
          AND ct.tag NOT LIKE 'meta/%'
          AND lower(trim(ct.tag)) NOT IN ({placeholders})
        GROUP BY normalized_tag
        HAVING COUNT(*) >= ?
        ORDER BY cnt DESC, normalized_tag ASC
    """

    params: list[Any] = [tag.lower() for tag in sorted(ACTIVITY_TAGS)]
    params.append(min_count)
    if limit is not None:
        query += " LIMIT ?"
        params.append(limit)

    rows = list(cursor.execute(query, params))
    return [
        {
            "tag": row[0],
            "count": row[1],
            "entity_type": classify_tag_entity_type(row[0]),
        }
        for row in rows
    ]
|
|
||
|
|
||
def promote_tag_entities(
    store: VectorStore,
    min_count: int = 500,
    limit: int | None = None,
    dry_run: bool = False,
) -> dict[str, Any]:
    """Promote high-frequency tags into KG entities and link matching chunks.

    Args:
        store: Open VectorStore; written to unless ``dry_run`` is set.
        min_count: Minimum tagged chunk count for a tag to be promoted.
        limit: Optional cap on the number of candidates considered.
        dry_run: When True, report candidates without creating anything.

    Returns:
        Stats dict: candidate count, entities/links created, promoted tags.

    NOTE(review): this performs bulk INSERTs; per the project's one-writer
    constraint it must not run while enrichment workers are writing — stop
    workers and checkpoint WAL first. Confirm orchestration at the call site.
    """
    candidates = find_promotion_candidates(store, min_count=min_count, limit=limit)
    stats = {
        "candidates": len(candidates),
        "entities_created": 0,
        "links_created": 0,
        "promoted_tags": [candidate["tag"] for candidate in candidates],
    }
    if dry_run:
        return stats

    cursor = store.conn.cursor()
    # Older databases may predate the mention_type column; probe the schema
    # once so the INSERT below matches what actually exists.
    kg_entity_chunk_cols = {row[1] for row in cursor.execute("PRAGMA table_info(kg_entity_chunks)")}
    has_mention_type = "mention_type" in kg_entity_chunk_cols

    for candidate in candidates:
        tag = candidate["tag"]
        entity_type = candidate["entity_type"]
        entity_id = f"auto-tag-{_slugify_tag(tag)}"
        existing = store.get_entity_by_name(entity_type, tag)
        if existing is None:
            store.upsert_entity(
                entity_id,
                entity_type,
                tag,
                metadata={"source": "tag-promotion", "tag_count": candidate["count"]},
                confidence=0.8,
                importance=0.6,
            )
            stats["entities_created"] += 1
        else:
            # Reuse the existing entity rather than creating a near-duplicate.
            entity_id = existing["id"]

        # Match chunk tags case/whitespace-insensitively so all variants of
        # the promoted tag get linked (superset of the previous exact match).
        normalized_tag = tag.lower().strip()
        if has_mention_type:
            cursor.execute(
                """
                INSERT OR IGNORE INTO kg_entity_chunks (entity_id, chunk_id, relevance, context, mention_type)
                SELECT ?, ct.chunk_id, 0.8, 'tag-promotion', 'tag'
                FROM chunk_tags ct
                WHERE lower(trim(ct.tag)) = ?
                """,
                (entity_id, normalized_tag),
            )
        else:
            cursor.execute(
                """
                INSERT OR IGNORE INTO kg_entity_chunks (entity_id, chunk_id, relevance, context)
                SELECT ?, ct.chunk_id, 0.8, 'tag-promotion'
                FROM chunk_tags ct
                WHERE lower(trim(ct.tag)) = ?
                """,
                (entity_id, normalized_tag),
            )
        # NOTE(review): assumes store.conn exposes an apsw-style changes()
        # method; stdlib sqlite3 only has the total_changes attribute —
        # confirm the DB driver in use.
        stats["links_created"] += store.conn.changes()

    return stats
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,108 @@ | ||
| """Tests for tag-to-entity promotion pipeline.""" | ||
|
|
||
| import json | ||
|
|
||
| import pytest | ||
|
|
||
| from brainlayer.vector_store import VectorStore | ||
|
|
||
|
|
||
@pytest.fixture
def store(tmp_path):
    """Yield a VectorStore backed by a throwaway on-disk database."""
    vector_store = VectorStore(tmp_path / "test.db")
    yield vector_store
    vector_store.close()
|
|
||
|
|
||
def _insert_chunk_with_tags(store, chunk_id, tags):
    """Insert a minimal chunk row carrying *tags* as a JSON-encoded list."""
    row = (chunk_id, f"content for {chunk_id}", len(chunk_id), json.dumps(tags))
    store.conn.cursor().execute(
        """INSERT INTO chunks (
            id, content, metadata, source_file, project, content_type,
            char_count, source, tags, created_at
        ) VALUES (?, ?, '{}', 'test.jsonl', 'brainlayer', 'assistant_text', ?, 'tests', ?, datetime('now'))""",
        row,
    )
|
|
||
|
|
||
class TestTagPromotionHeuristics:
    def test_classify_tag_entity_type_uses_spec_heuristics(self):
        from brainlayer.pipeline.tag_entity_promotion import classify_tag_entity_type

        assert classify_tag_entity_type("telegram") == "technology"
        assert classify_tag_entity_type("andrew-huberman") == "person"
        assert classify_tag_entity_type("neuroscience") == "topic"
        assert classify_tag_entity_type("hebrew-writing") == "topic"
        assert classify_tag_entity_type("founders-community") == "community"
        assert classify_tag_entity_type("morning-routine") == "topic"

    def test_classify_tag_entity_type_normalizes_case_and_whitespace(self):
        from brainlayer.pipeline.tag_entity_promotion import classify_tag_entity_type

        # Regression: mixed-case or padded tags must classify exactly like
        # their normalized (slugified) forms.
        assert classify_tag_entity_type("Telegram") == "technology"
        assert classify_tag_entity_type("  Andrew-Huberman  ") == "person"
        assert classify_tag_entity_type("Founders Community") == "community"
|
|
||
|
|
||
class TestTagPromotionCandidates:
    def test_find_candidates_skips_existing_and_activity_tags(self, store):
        """Activity tags and tags already present as entities are excluded."""
        from brainlayer.pipeline.tag_entity_promotion import find_promotion_candidates

        shared_tags = ["telegram", "debugging", "existing-topic"]
        for chunk_id in ("chunk-1", "chunk-2"):
            _insert_chunk_with_tags(store, chunk_id, shared_tags)
        store.upsert_entity("existing-topic", "topic", "existing-topic")

        found = find_promotion_candidates(store, min_count=2)

        assert [entry["tag"] for entry in found] == ["telegram"]
Comment on lines
+47
to
+57
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Add a mixed-case tag regression test. Current scenarios only use lowercase tags. Please add coverage for variants like Also applies to: 60-91 🤖 Prompt for AI Agents |
||
|
|
||
class TestTagPromotionExecution:
    def test_promote_tag_candidates_creates_entities_and_links_chunks(self, store):
        """End-to-end promotion: entities created and matching chunks linked."""
        from brainlayer.pipeline.tag_entity_promotion import promote_tag_entities

        _insert_chunk_with_tags(store, "chunk-1", ["telegram", "feature-dev"])
        _insert_chunk_with_tags(store, "chunk-2", ["telegram"])
        _insert_chunk_with_tags(store, "chunk-3", ["neuroscience"])
        _insert_chunk_with_tags(store, "chunk-4", ["neuroscience"])

        stats = promote_tag_entities(store, min_count=2)

        assert stats["candidates"] == 2
        assert stats["entities_created"] == 2
        assert stats["links_created"] == 4

        read_cursor = store._read_cursor()
        created = dict(
            read_cursor.execute(
                "SELECT name, entity_type FROM kg_entities WHERE id LIKE 'auto-tag-%'"
            )
        )
        assert created["telegram"] == "technology"
        assert created["neuroscience"] == "topic"

        link_rows = list(
            read_cursor.execute(
                "SELECT entity_id, chunk_id, mention_type FROM kg_entity_chunks WHERE entity_id LIKE 'auto-tag-%'"
            )
        )
        assert len(link_rows) == 4
        assert {mention for _, _, mention in link_rows} == {"tag"}

    def test_vector_store_seeds_new_entity_types(self, store):
        """The schema seeds the expanded entity-type hierarchy rows."""
        seeded_types = ("topic", "protocol", "community", "health_metric", "workflow", "device")
        read_cursor = store._read_cursor()
        hierarchy = dict(
            read_cursor.execute(
                "SELECT child_type, parent_type FROM entity_type_hierarchy WHERE child_type IN (?, ?, ?, ?, ?, ?)",
                seeded_types,
            )
        )

        assert hierarchy == {
            "topic": "concept",
            "protocol": "topic",
            "community": "entity",
            "health_metric": "topic",
            "workflow": "concept",
            "device": "entity",
        }
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Add explicit DB-target and write confirmation for non-dry runs.
Line [27] currently opens the canonical DB implicitly. A mistaken shell/env can write to the wrong database with no guard.
Proposed hardening
def main() -> int: parser = argparse.ArgumentParser(description="Promote high-frequency chunk tags into KG entities") + parser.add_argument("--db-path", type=Path, default=None, help="Override DB path (defaults to get_db_path())") parser.add_argument("--min-count", type=int, default=500, help="Minimum tagged chunk count to promote") parser.add_argument("--limit", type=int, default=None, help="Optional candidate limit") parser.add_argument("--dry-run", action="store_true", help="Show candidates without writing") + parser.add_argument("--yes", action="store_true", help="Confirm writes when not using --dry-run") args = parser.parse_args() + if not args.dry_run and not args.yes: + parser.error("Refusing to write without --yes (or use --dry-run).") + + db_path = args.db_path or get_db_path() store = None try: - store = VectorStore(get_db_path()) + store = VectorStore(db_path) stats = promote_tag_entities( store, min_count=args.min_count, limit=args.limit, dry_run=args.dry_run, )Also applies to: 34-35
🤖 Prompt for AI Agents