-
Notifications
You must be signed in to change notification settings - Fork 7
feat: co-occurrence relation extraction for KG edges #178
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -497,6 +497,40 @@ def extract_entities_from_tags( | |
| return entities | ||
|
|
||
|
|
||
| def extract_cooccurrence_relations(entities: list[ExtractedEntity]) -> list[ExtractedRelation]: | ||
| """Infer co-occurrence relations between entities of different types. | ||
|
|
||
| Two entities in the same text that have different types are assumed to be | ||
| related (e.g., a project uses a technology). This is a low-cost heuristic | ||
| that runs without any LLM, producing edges for the knowledge graph. | ||
|
|
||
| Only cross-type pairs are linked — same-type pairs (project-project) are | ||
| skipped as too noisy. | ||
| """ | ||
| relations: list[ExtractedRelation] = [] | ||
| seen: set[tuple[str, str]] = set() | ||
|
|
||
| for i, a in enumerate(entities): | ||
| for b in entities[i + 1 :]: | ||
| if a.entity_type == b.entity_type: | ||
| continue | ||
| pair = (a.text, b.text) if a.text < b.text else (b.text, a.text) | ||
| if pair in seen: | ||
| continue | ||
| seen.add(pair) | ||
|
Comment on lines
+513
to
+520
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Potential quadratic blowup for large entity lists. For Consider capping output (e.g., top-k by confidence) or limiting to entities above a confidence threshold. This is a minor concern for typical chunk sizes but could degrade performance on outlier chunks. 🛡️ Optional: Add a cap on relation count+MAX_COOCCURRENCE_RELATIONS = 50 # Cap to avoid noise explosion
+
def extract_cooccurrence_relations(entities: list[ExtractedEntity]) -> list[ExtractedRelation]:
...
relations: list[ExtractedRelation] = []
seen: set[tuple[str, str]] = set()
for i, a in enumerate(entities):
for b in entities[i + 1 :]:
+ if len(relations) >= MAX_COOCCURRENCE_RELATIONS:
+ break
if a.entity_type == b.entity_type:
continue
...
+ if len(relations) >= MAX_COOCCURRENCE_RELATIONS:
+ break
return relations🤖 Prompt for AI Agents |
||
| confidence = min(a.confidence, b.confidence) * 0.7 | ||
| relations.append( | ||
| ExtractedRelation( | ||
| source_text=a.text, | ||
| target_text=b.text, | ||
| relation_type="co_occurs_with", | ||
| confidence=confidence, | ||
| ) | ||
| ) | ||
|
|
||
| return relations | ||
|
|
||
|
|
||
| def extract_entities_combined( | ||
| text: str, | ||
| seed_entities: dict[str, list[str]], | ||
|
|
@@ -543,6 +577,10 @@ def extract_entities_combined( | |
|
|
||
| final_entities.sort(key=lambda e: e.start) | ||
|
|
||
| # 4. Co-occurrence relations (always runs — no LLM needed) | ||
| cooccurrence = extract_cooccurrence_relations(final_entities) | ||
| all_relations.extend(cooccurrence) | ||
|
|
||
| return ExtractionResult( | ||
| entities=final_entities, | ||
| relations=all_relations, | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,69 @@ | ||||||||||||||||||||||||||||||||||||||||||||||
| """Tests for KG relation extraction — co-occurrence based.""" | ||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||
| from brainlayer.pipeline.entity_extraction import ( | ||||||||||||||||||||||||||||||||||||||||||||||
| ExtractedEntity, | ||||||||||||||||||||||||||||||||||||||||||||||
| extract_cooccurrence_relations, | ||||||||||||||||||||||||||||||||||||||||||||||
| extract_entities_combined, | ||||||||||||||||||||||||||||||||||||||||||||||
| ) | ||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||
| class TestCooccurrenceRelations: | ||||||||||||||||||||||||||||||||||||||||||||||
| """Rule-based relation extraction from co-occurring entities.""" | ||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||
| def test_two_entities_produce_relation(self): | ||||||||||||||||||||||||||||||||||||||||||||||
| """Two entities in the same text should produce a co-occurrence relation.""" | ||||||||||||||||||||||||||||||||||||||||||||||
| entities = [ | ||||||||||||||||||||||||||||||||||||||||||||||
| ExtractedEntity(text="BrainLayer", entity_type="project", start=0, end=10, confidence=0.9, source="seed"), | ||||||||||||||||||||||||||||||||||||||||||||||
| ExtractedEntity(text="SQLite", entity_type="technology", start=20, end=26, confidence=0.8, source="seed"), | ||||||||||||||||||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||||||||||||||||||
| relations = extract_cooccurrence_relations(entities) | ||||||||||||||||||||||||||||||||||||||||||||||
| assert len(relations) >= 1 | ||||||||||||||||||||||||||||||||||||||||||||||
| rel = relations[0] | ||||||||||||||||||||||||||||||||||||||||||||||
| assert rel.source_text == "BrainLayer" | ||||||||||||||||||||||||||||||||||||||||||||||
| assert rel.target_text == "SQLite" | ||||||||||||||||||||||||||||||||||||||||||||||
| assert rel.relation_type == "co_occurs_with" | ||||||||||||||||||||||||||||||||||||||||||||||
| assert 0 < rel.confidence <= 1.0 | ||||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+13
to
+25
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Test should verify the confidence calculation. The test checks 💚 Proposed enhancement assert 0 < rel.confidence <= 1.0
+ # min(0.9, 0.8) * 0.7 = 0.56
+ assert rel.confidence == pytest.approx(0.56)Add 🤖 Prompt for AI Agents |
||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||
| def test_no_relations_for_single_entity(self): | ||||||||||||||||||||||||||||||||||||||||||||||
| """A single entity can't have co-occurrence relations.""" | ||||||||||||||||||||||||||||||||||||||||||||||
| entities = [ | ||||||||||||||||||||||||||||||||||||||||||||||
| ExtractedEntity(text="BrainLayer", entity_type="project", start=0, end=10, confidence=0.9, source="seed"), | ||||||||||||||||||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||||||||||||||||||
| relations = extract_cooccurrence_relations(entities) | ||||||||||||||||||||||||||||||||||||||||||||||
| assert len(relations) == 0 | ||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||
| def test_no_duplicate_relations(self): | ||||||||||||||||||||||||||||||||||||||||||||||
| """Same entity pair should produce at most one relation.""" | ||||||||||||||||||||||||||||||||||||||||||||||
| entities = [ | ||||||||||||||||||||||||||||||||||||||||||||||
| ExtractedEntity(text="Foo", entity_type="project", start=0, end=3, confidence=0.9, source="seed"), | ||||||||||||||||||||||||||||||||||||||||||||||
| ExtractedEntity(text="Bar", entity_type="technology", start=10, end=13, confidence=0.8, source="seed"), | ||||||||||||||||||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||||||||||||||||||
| relations = extract_cooccurrence_relations(entities) | ||||||||||||||||||||||||||||||||||||||||||||||
| pairs = [(r.source_text, r.target_text) for r in relations] | ||||||||||||||||||||||||||||||||||||||||||||||
| assert len(pairs) == len(set(pairs)) | ||||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+35
to
+43
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Test doesn't fully verify deduplication—only checks uniqueness of returned pairs. This test passes even if duplicates were never possible in the first place. To properly test deduplication, provide duplicate entity pairs (e.g., same text appearing twice in the input list) and verify only one relation is produced. 💚 Proposed fix to test actual deduplication logic def test_no_duplicate_relations(self):
- """Same entity pair should produce at most one relation."""
+ """Duplicate entity pairs in input should produce only one relation."""
entities = [
ExtractedEntity(text="Foo", entity_type="project", start=0, end=3, confidence=0.9, source="seed"),
ExtractedEntity(text="Bar", entity_type="technology", start=10, end=13, confidence=0.8, source="seed"),
+ # Duplicate mention of same pair
+ ExtractedEntity(text="Foo", entity_type="project", start=20, end=23, confidence=0.85, source="gliner"),
+ ExtractedEntity(text="Bar", entity_type="technology", start=30, end=33, confidence=0.75, source="gliner"),
]
relations = extract_cooccurrence_relations(entities)
- pairs = [(r.source_text, r.target_text) for r in relations]
- assert len(pairs) == len(set(pairs))
+ # Should only produce one Foo-Bar relation despite 4 possible pairings
+ foo_bar_relations = [r for r in relations if {r.source_text, r.target_text} == {"Foo", "Bar"}]
+ assert len(foo_bar_relations) == 1📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||
| def test_same_type_entities_not_related(self): | ||||||||||||||||||||||||||||||||||||||||||||||
| """Entities of the same type shouldn't get co-occurrence relations (too noisy).""" | ||||||||||||||||||||||||||||||||||||||||||||||
| entities = [ | ||||||||||||||||||||||||||||||||||||||||||||||
| ExtractedEntity(text="Foo", entity_type="project", start=0, end=3, confidence=0.9, source="seed"), | ||||||||||||||||||||||||||||||||||||||||||||||
| ExtractedEntity(text="Bar", entity_type="project", start=10, end=13, confidence=0.8, source="seed"), | ||||||||||||||||||||||||||||||||||||||||||||||
| ] | ||||||||||||||||||||||||||||||||||||||||||||||
| relations = extract_cooccurrence_relations(entities) | ||||||||||||||||||||||||||||||||||||||||||||||
| assert len(relations) == 0 | ||||||||||||||||||||||||||||||||||||||||||||||
|
Comment on lines
+10
to
+52
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧹 Nitpick | 🔵 Trivial Consider adding edge case tests. Missing coverage for:
💚 Additional test casesdef test_empty_entities_returns_empty(self):
"""Empty input should return empty relations."""
relations = extract_cooccurrence_relations([])
assert relations == []
def test_three_different_types_produce_three_relations(self):
"""N entities of different types produce N*(N-1)/2 relations."""
entities = [
ExtractedEntity(text="A", entity_type="project", start=0, end=1, confidence=0.9, source="seed"),
ExtractedEntity(text="B", entity_type="technology", start=5, end=6, confidence=0.8, source="seed"),
ExtractedEntity(text="C", entity_type="person", start=10, end=11, confidence=0.7, source="seed"),
]
relations = extract_cooccurrence_relations(entities)
assert len(relations) == 3 # A-B, A-C, B-C🤖 Prompt for AI Agents |
||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||
| class TestCombinedExtractsRelations: | ||||||||||||||||||||||||||||||||||||||||||||||
| """extract_entities_combined should produce relations even without LLM.""" | ||||||||||||||||||||||||||||||||||||||||||||||
|
|
||||||||||||||||||||||||||||||||||||||||||||||
| def test_combined_produces_cooccurrence_relations(self): | ||||||||||||||||||||||||||||||||||||||||||||||
| """Combined extraction should include co-occurrence relations from entities.""" | ||||||||||||||||||||||||||||||||||||||||||||||
| seed = { | ||||||||||||||||||||||||||||||||||||||||||||||
| "project": ["BrainLayer"], | ||||||||||||||||||||||||||||||||||||||||||||||
| "technology": ["SQLite", "Python"], | ||||||||||||||||||||||||||||||||||||||||||||||
| } | ||||||||||||||||||||||||||||||||||||||||||||||
| text = "BrainLayer uses SQLite for storage and Python for the CLI." | ||||||||||||||||||||||||||||||||||||||||||||||
| result = extract_entities_combined(text, seed, llm_caller=None, use_llm=False) | ||||||||||||||||||||||||||||||||||||||||||||||
| assert len(result.entities) >= 2 | ||||||||||||||||||||||||||||||||||||||||||||||
| assert len(result.relations) >= 1 | ||||||||||||||||||||||||||||||||||||||||||||||
| rel_types = {r.relation_type for r in result.relations} | ||||||||||||||||||||||||||||||||||||||||||||||
| assert "co_occurs_with" in rel_types | ||||||||||||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧹 Nitpick | 🔵 Trivial
Lexicographic ordering uses raw text casing—may cause inconsistent deduplication.
a.text < b.textcompares by Unicode codepoint, so"Apple"vs"apple"yields different orderings. If the same entity appears with different casing from different sources (e.g., "SQLite" vs "sqlite"), they won't dedupe correctly.Consider normalizing to lowercase for the pair key:
♻️ Proposed fix
📝 Committable suggestion
🤖 Prompt for AI Agents