From abdbbb2a66d91f833d8564ff0f597ca7e8459057 Mon Sep 17 00:00:00 2001
From: Etan Joseph Heyman
Date: Sat, 4 Apr 2026 18:15:38 +0300
Subject: [PATCH 1/2] feat: promote high-signal tags into entities

---
Review revision notes (text between the three-dash line and the first
"diff --git" line is not part of the commit message):

 * The line structure of this series was restored; the reviewed copy had all
   whitespace collapsed. Leading whitespace is reconstructed, including the
   context lines of the vector_store.py hunk (12 spaces assumed). If that hunk
   does not match your tree, apply with `git apply --ignore-whitespace`.
 * tag_entity_promotion.py: entity ids are built by _entity_id_for_tag(), so
   tags whose slug is empty or lossy (non-ASCII tags, "c++" vs "c") no longer
   share one entity id. Ids of plain tags are unchanged.
 * tag_entity_promotion.py: the activity prefix filter is bound from
   ACTIVITY_TAG_PREFIXES instead of being repeated as SQL literals; the link
   INSERT statement is selected once, outside the per-candidate loop.
 * The "index" blob ids below are those of the original revision.

 scripts/promote_tag_entities.py               |  42 ++++
 .../pipeline/tag_entity_promotion.py          | 254 ++++++++++++++++++
 src/brainlayer/vector_store.py                |   6 +
 tests/test_tag_entity_promotion.py            | 116 +++++++++
 4 files changed, 418 insertions(+)
 create mode 100644 scripts/promote_tag_entities.py
 create mode 100644 src/brainlayer/pipeline/tag_entity_promotion.py
 create mode 100644 tests/test_tag_entity_promotion.py

diff --git a/scripts/promote_tag_entities.py b/scripts/promote_tag_entities.py
new file mode 100644
index 00000000..dc83b1dd
--- /dev/null
+++ b/scripts/promote_tag_entities.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+"""Promote high-frequency concept tags into KG entities."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))
+
+from brainlayer.paths import get_db_path
+from brainlayer.pipeline.tag_entity_promotion import promote_tag_entities
+from brainlayer.vector_store import VectorStore
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Promote high-frequency chunk tags into KG entities")
+    parser.add_argument("--min-count", type=int, default=500, help="Minimum tagged chunk count to promote")
+    parser.add_argument("--limit", type=int, default=None, help="Optional candidate limit")
+    parser.add_argument("--dry-run", action="store_true", help="Show candidates without writing")
+    args = parser.parse_args()
+
+    store = None
+    try:
+        store = VectorStore(get_db_path())
+        stats = promote_tag_entities(
+            store,
+            min_count=args.min_count,
+            limit=args.limit,
+            dry_run=args.dry_run,
+        )
+        print(json.dumps(stats, indent=2, sort_keys=True))
+        return 0
+    finally:
+        if store is not None:
+            store.close()
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/src/brainlayer/pipeline/tag_entity_promotion.py b/src/brainlayer/pipeline/tag_entity_promotion.py
new file mode 100644
index 00000000..efedb0e5
--- /dev/null
+++ b/src/brainlayer/pipeline/tag_entity_promotion.py
@@ -0,0 +1,254 @@
+"""Promote high-frequency concept tags into KG entities."""
+
+from __future__ import annotations
+
+import hashlib
+import re
+from typing import Any
+
+from ..vector_store import VectorStore
+
+ACTIVITY_TAG_PREFIXES = ("act:", "dom:", "meta/")
+ACTIVITY_TAGS = {
+    "debugging",
+    "testing",
+    "refactoring",
+    "code-review",
+    "bug-fix",
+    "feature-dev",
+    "configuration",
+    "documentation",
+    "project-management",
+    "error-handling",
+    "task-management",
+    "deployment",
+    "workflow",
+    "automation",
+    "verification",
+    "command-line",
+    "planning",
+    "tooling",
+    "file-system",
+    "file-management",
+    "scripting",
+    "monitoring",
+    "assistant-action",
+    "status-update",
+    "version-control",
+    "implementation",
+    "collaboration",
+    "discussion",
+    "code-analysis",
+    "metadata",
+    "architecture",
+    "styling",
+    "confirmation",
+    "troubleshooting",
+    "design",
+    "frontend",
+    "backend",
+    "command",
+    "shell",
+    "bash",
+    "grep",
+    "json",
+    "regex",
+    "html",
+    "css",
+    "svg",
+}
+
+PERSON_TAGS = {
+    "andrew-huberman",
+    "avi-simon",
+    "daniel-munk",
+    "dor-zohar",
+    "etan-heyman",
+    "joshua-anderson",
+    "maor-noah",
+    "shachar-gerby",
+    "theo-browne",
+    "yuval-nir",
+}
+
+TECHNOLOGY_TAGS = {
+    "1password",
+    "convex",
+    "docker",
+    "javascript",
+    "nextjs",
+    "openai",
+    "postgres",
+    "python",
+    "railway",
+    "react",
+    "sqlite",
+    "supabase",
+    "telegram",
+    "typescript",
+    "whatsapp",
+}
+
+TOPIC_TAGS = {
+    "cold-exposure",
+    "dopamine",
+    "exercise",
+    "fitness",
+    "metabolism",
+    "neuroscience",
+    "nutrition",
+    "psychology",
+    "supplements",
+    "wellness",
+}
+
+HEBREW_MARKERS = {"hebrew", "עברית", "rtl", "right-to-left"}
+COMMUNITY_MARKERS = {"community", "collective", "crew", "forum", "group", "guild", "network"}
+
+
+def _slugify_tag(tag: str) -> str:
+    """Lower-case *tag* and squash each run of non ``[a-z0-9]`` characters into one hyphen."""
+    # The ``+`` quantifier already collapses runs, so no second pass is needed.
+    return re.sub(r"[^a-z0-9]+", "-", tag.lower()).strip("-")
+
+
+def _entity_id_for_tag(tag: str) -> str:
+    """Return a stable ``auto-tag-*`` entity id that is unique per lower-cased tag.
+
+    Slugification is lossy: tags without ASCII alphanumerics (for example
+    Hebrew-only tags) slugify to an empty string, and tags such as ``c++`` and
+    ``c`` share a slug. Whenever the slug differs from the lower-cased tag, a
+    short digest of the tag is appended so distinct tags never share an id.
+    """
+    lowered = tag.lower()
+    slug = _slugify_tag(tag)
+    if slug == lowered:
+        return f"auto-tag-{slug}"
+    digest = hashlib.sha256(lowered.encode("utf-8")).hexdigest()[:8]
+    return f"auto-tag-{slug}-{digest}" if slug else f"auto-tag-{digest}"
+
+
+def classify_tag_entity_type(tag: str) -> str:
+    """Infer an entity type from a promoted tag.
+
+    Exact matches against the curated person/technology sets win first, then a
+    substring match against the community markers; everything else is a topic.
+    """
+    normalized = _slugify_tag(tag)
+
+    if normalized in PERSON_TAGS:
+        return "person"
+    if normalized in TECHNOLOGY_TAGS:
+        return "technology"
+    if any(marker in normalized for marker in COMMUNITY_MARKERS):
+        return "community"
+    if normalized in TOPIC_TAGS or any(marker in normalized for marker in HEBREW_MARKERS):
+        return "topic"
+    # Unknown tags default to the same type as the explicit topic rule above.
+    return "topic"
+
+
+def find_promotion_candidates(store: VectorStore, min_count: int = 500, limit: int | None = None) -> list[dict[str, Any]]:
+    """Find high-frequency concept tags worth promoting to entities.
+
+    Returns one ``{"tag", "count", "entity_type"}`` dict per tag that occurs at
+    least ``min_count`` times in ``chunk_tags``, is not an activity tag, and has
+    no case-insensitive name match in ``kg_entities``; most frequent first.
+    """
+    cursor = store._read_cursor()
+    # Bind the prefixes from ACTIVITY_TAG_PREFIXES so the SQL cannot drift from the constant.
+    prefix_filters = " ".join("AND ct.tag NOT LIKE ?" for _ in ACTIVITY_TAG_PREFIXES)
+    placeholders = ", ".join("?" for _ in ACTIVITY_TAGS)
+    query = f"""
+        SELECT ct.tag, COUNT(*) as cnt
+        FROM chunk_tags ct
+        LEFT JOIN kg_entities e ON lower(e.name) = lower(ct.tag)
+        WHERE e.id IS NULL
+          AND ct.tag IS NOT NULL
+          AND ct.tag != ''
+          {prefix_filters}
+          AND lower(ct.tag) NOT IN ({placeholders})
+        GROUP BY ct.tag
+        HAVING COUNT(*) >= ?
+        ORDER BY cnt DESC, ct.tag ASC
+    """
+    params: list[Any] = [f"{prefix}%" for prefix in ACTIVITY_TAG_PREFIXES]
+    params.extend(tag.lower() for tag in sorted(ACTIVITY_TAGS))
+    params.append(min_count)
+    if limit is not None:
+        query += " LIMIT ?"
+        params.append(limit)
+
+    rows = list(cursor.execute(query, params))
+    return [
+        {
+            "tag": row[0],
+            "count": row[1],
+            "entity_type": classify_tag_entity_type(row[0]),
+        }
+        for row in rows
+    ]
+
+
+def promote_tag_entities(
+    store: VectorStore,
+    min_count: int = 500,
+    limit: int | None = None,
+    dry_run: bool = False,
+) -> dict[str, Any]:
+    """Promote high-frequency tags into KG entities and link matching chunks.
+
+    With ``dry_run`` the candidates are reported and nothing is written.
+    Returns counters for candidates, created entities and created links, plus
+    the list of candidate tag names under ``promoted_tags``.
+    """
+    candidates = find_promotion_candidates(store, min_count=min_count, limit=limit)
+    stats = {
+        "candidates": len(candidates),
+        "entities_created": 0,
+        "links_created": 0,
+        "promoted_tags": [candidate["tag"] for candidate in candidates],
+    }
+    if dry_run:
+        return stats
+
+    cursor = store.conn.cursor()
+    kg_entity_chunk_cols = {row[1] for row in cursor.execute("PRAGMA table_info(kg_entity_chunks)")}
+    # The ``mention_type`` column may be absent; pick the matching statement once, outside the loop.
+    if "mention_type" in kg_entity_chunk_cols:
+        link_sql = """
+            INSERT OR IGNORE INTO kg_entity_chunks (entity_id, chunk_id, relevance, context, mention_type)
+            SELECT ?, ct.chunk_id, 0.8, 'tag-promotion', 'tag'
+            FROM chunk_tags ct
+            WHERE ct.tag = ?
+        """
+    else:
+        link_sql = """
+            INSERT OR IGNORE INTO kg_entity_chunks (entity_id, chunk_id, relevance, context)
+            SELECT ?, ct.chunk_id, 0.8, 'tag-promotion'
+            FROM chunk_tags ct
+            WHERE ct.tag = ?
+        """
+
+    for candidate in candidates:
+        tag = candidate["tag"]
+        entity_type = candidate["entity_type"]
+        existing = store.get_entity_by_name(entity_type, tag)
+        if existing is None:
+            entity_id = _entity_id_for_tag(tag)
+            store.upsert_entity(
+                entity_id,
+                entity_type,
+                tag,
+                metadata={"source": "tag-promotion", "tag_count": candidate["count"]},
+                confidence=0.8,
+                importance=0.6,
+            )
+            stats["entities_created"] += 1
+        else:
+            entity_id = existing["id"]
+
+        cursor.execute(link_sql, (entity_id, tag))
+        stats["links_created"] += store.conn.changes()
+
+    return stats
diff --git a/src/brainlayer/vector_store.py b/src/brainlayer/vector_store.py
index b5a395ea..67016140 100644
--- a/src/brainlayer/vector_store.py
+++ b/src/brainlayer/vector_store.py
@@ -627,6 +627,12 @@ def _init_db(self) -> None:
             ("tool", "entity", "Software tool or service"),
             ("project", "entity", "Software project or initiative"),
             ("concept", "entity", "Abstract concept, pattern, or domain idea"),
+            ("topic", "concept", "Recurring subject or thematic area"),
+            ("protocol", "topic", "Named workflow or protocol"),
+            ("community", "entity", "Community, audience, or social group"),
+            ("health_metric", "topic", "Health or wellness metric"),
+            ("workflow", "concept", "Repeatable workflow or process"),
+            ("device", "entity", "Hardware device or machine"),
             ("event", "entity", "Temporal event or occurrence"),
             ("organization", "entity", "Company or group"),
             ("golem", "agent", "Specialized AI agent in the golems ecosystem"),
diff --git a/tests/test_tag_entity_promotion.py b/tests/test_tag_entity_promotion.py
new file mode 100644
index 00000000..c3d8078c
--- /dev/null
+++ b/tests/test_tag_entity_promotion.py
@@ -0,0 +1,116 @@
+"""Tests for tag-to-entity promotion pipeline."""
+
+import json
+
+import pytest
+
+from brainlayer.vector_store import VectorStore
+
+
+@pytest.fixture
+def store(tmp_path):
+    db_path = tmp_path / "test.db"
+    s = VectorStore(db_path)
+    yield s
+    s.close()
+
+
+def _insert_chunk_with_tags(store, chunk_id, tags):
+    cursor = store.conn.cursor()
+    cursor.execute(
+        """INSERT INTO chunks (
+            id, content, metadata, source_file, project, content_type,
+            char_count, source, tags, created_at
+        ) VALUES (?, ?, '{}', 'test.jsonl', 'brainlayer', 'assistant_text', ?, 'tests', ?, datetime('now'))""",
+        (
+            chunk_id,
+            f"content for {chunk_id}",
+            len(chunk_id),
+            json.dumps(tags),
+        ),
+    )
+
+
+class TestTagPromotionHeuristics:
+    def test_classify_tag_entity_type_uses_spec_heuristics(self):
+        from brainlayer.pipeline.tag_entity_promotion import classify_tag_entity_type
+
+        assert classify_tag_entity_type("telegram") == "technology"
+        assert classify_tag_entity_type("andrew-huberman") == "person"
+        assert classify_tag_entity_type("neuroscience") == "topic"
+        assert classify_tag_entity_type("hebrew-writing") == "topic"
+        assert classify_tag_entity_type("founders-community") == "community"
+        assert classify_tag_entity_type("morning-routine") == "topic"
+
+    def test_entity_ids_stay_unique_for_lossy_slugs(self):
+        from brainlayer.pipeline.tag_entity_promotion import _entity_id_for_tag
+
+        assert _entity_id_for_tag("telegram") == "auto-tag-telegram"
+        assert _entity_id_for_tag("Telegram") == "auto-tag-telegram"
+        assert _entity_id_for_tag("c++") != _entity_id_for_tag("c")
+        assert _entity_id_for_tag("日本語") != _entity_id_for_tag("中文")
+
+
+class TestTagPromotionCandidates:
+    def test_find_candidates_skips_existing_and_activity_tags(self, store):
+        from brainlayer.pipeline.tag_entity_promotion import find_promotion_candidates
+
+        _insert_chunk_with_tags(store, "chunk-1", ["telegram", "debugging", "existing-topic"])
+        _insert_chunk_with_tags(store, "chunk-2", ["telegram", "debugging", "existing-topic"])
+        store.upsert_entity("existing-topic", "topic", "existing-topic")
+
+        candidates = find_promotion_candidates(store, min_count=2)
+
+        assert [candidate["tag"] for candidate in candidates] == ["telegram"]
+
+
+class TestTagPromotionExecution:
+    def test_promote_tag_candidates_creates_entities_and_links_chunks(self, store):
+        from brainlayer.pipeline.tag_entity_promotion import promote_tag_entities
+
+        _insert_chunk_with_tags(store, "chunk-1", ["telegram", "feature-dev"])
+        _insert_chunk_with_tags(store, "chunk-2", ["telegram"])
+        _insert_chunk_with_tags(store, "chunk-3", ["neuroscience"])
+        _insert_chunk_with_tags(store, "chunk-4", ["neuroscience"])
+
+        stats = promote_tag_entities(store, min_count=2)
+
+        assert stats["candidates"] == 2
+        assert stats["entities_created"] == 2
+        assert stats["links_created"] == 4
+
+        cursor = store._read_cursor()
+        entities = {
+            row[0]: row[1]
+            for row in cursor.execute(
+                "SELECT name, entity_type FROM kg_entities WHERE id LIKE 'auto-tag-%'"
+            )
+        }
+        assert entities["telegram"] == "technology"
+        assert entities["neuroscience"] == "topic"
+
+        links = list(
+            cursor.execute(
+                "SELECT entity_id, chunk_id, mention_type FROM kg_entity_chunks WHERE entity_id LIKE 'auto-tag-%'"
+            )
+        )
+        assert len(links) == 4
+        assert {row[2] for row in links} == {"tag"}
+
+    def test_vector_store_seeds_new_entity_types(self, store):
+        cursor = store._read_cursor()
+        rows = list(
+            cursor.execute(
+                "SELECT child_type, parent_type FROM entity_type_hierarchy WHERE child_type IN (?, ?, ?, ?, ?, ?)",
+                ("topic", "protocol", "community", "health_metric", "workflow", "device"),
+            )
+        )
+
+        assert dict(rows) == {
+            "topic": "concept",
+            "protocol": "topic",
+            "community": "entity",
+            "health_metric": "topic",
+            "workflow": "concept",
+            "device": "entity",
+        }

From f2d1a35dcec0798366cdf41b589a6ac7cf27d045 Mon Sep 17 00:00:00 2001
From: Etan Joseph Heyman
Date: Sat, 4 Apr 2026 18:22:42 +0300
Subject: [PATCH 2/2] fix: remove dead hebrew marker

---
Review revision note: the hunk below moved from line 101 to line 102 because
patch 1/2 now adds `import hashlib` to this module.

 src/brainlayer/pipeline/tag_entity_promotion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/brainlayer/pipeline/tag_entity_promotion.py b/src/brainlayer/pipeline/tag_entity_promotion.py
index efedb0e5..26fa634d 100644
--- a/src/brainlayer/pipeline/tag_entity_promotion.py
+++ b/src/brainlayer/pipeline/tag_entity_promotion.py
@@ -102,7 +102,7 @@
     "wellness",
 }
 
-HEBREW_MARKERS = {"hebrew", "עברית", "rtl", "right-to-left"}
+HEBREW_MARKERS = {"hebrew", "rtl", "right-to-left"}
 COMMUNITY_MARKERS = {"community", "collective", "crew", "forum", "group", "guild", "network"}
 
 