From 623aa004c97840237f774857e552bccb31f90165 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 17:07:40 +0000
Subject: [PATCH 01/23] CU-869b44wz8: Create new abstraction layer for entity
 providing components (e.g NER and Linker)

---
 medcat-v2/medcat/components/types.py | 76 +++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/medcat-v2/medcat/components/types.py b/medcat-v2/medcat/components/types.py
index e2c28706d..a3aa549eb 100644
--- a/medcat-v2/medcat/components/types.py
+++ b/medcat-v2/medcat/components/types.py
@@ -1,6 +1,8 @@
 from typing import Optional, Protocol, Callable, runtime_checkable, Union
+from typing import Literal
 from typing_extensions import Self
 from enum import Enum, auto
+from abc import ABC, abstractmethod
 
 from medcat.utils.registry import Registry, MedCATRegistryException
 from medcat.tokenizing.tokens import MutableDocument, MutableEntity
@@ -69,7 +71,7 @@ def get_type(self) -> CoreComponentType:
         pass
 
 
-class AbstractCoreComponent(CoreComponent):
+class AbstractCoreComponent(ABC, CoreComponent):
     NAME_PREFIX = "core_"
 
     @property
@@ -80,6 +82,78 @@ def is_core(self) -> bool:
         return True
 
 
+class AbstractEntityProvidingComponent(AbstractCoreComponent):
+    """This is an abstract NER or linker component.
+
+    The class simplifies some things so that they don't have to be
+    re-implemented in each implementation.
+    """
+
+    def __init__(self,
+                 read_from_linked_ents: bool | Literal['auto'] = 'auto',
+                 write_to_linked_ents: bool | Literal['auto'] = 'auto'):
+        is_linker = self.get_type() == CoreComponentType.linking
+        if read_from_linked_ents == 'auto':
+            self._read_from_linked_ents = is_linker
+        else:
+            self._read_from_linked_ents = read_from_linked_ents
+        if write_to_linked_ents == 'auto':
+            self._write_to_linked_ents = is_linker
+        else:
+            self._write_to_linked_ents = write_to_linked_ents
+
+    # NOTE: These 2 are separated as methods to allow for custom behaviour
+    #       when deeriving from this class
+    def get_ents_in(self, doc: MutableDocument) -> list[MutableEntity] | None:
+        return doc.ner_ents.copy() if self._read_from_linked_ents else None
+
+    def set_ents(self, doc: MutableDocument, ents: list[MutableEntity]
+                 ) -> None:
+        if self._write_to_linked_ents:
+            self.set_linked_ents(doc, ents)
+        else:
+            self.set_ner_ents(doc, ents)
+
+    @classmethod
+    def set_ner_ents(cls, doc: MutableDocument, ents: list[MutableEntity]
+                     ) -> None:
+        doc.ner_ents.clear()
+        doc.ner_ents.extend(ents)
+
+    @classmethod
+    def set_linked_ents(cls, doc: MutableDocument, ents: list[MutableEntity]
+                        ) -> None:
+        doc.linked_ents.clear()
+        doc.linked_ents.extend(ents)
+
+    @abstractmethod
+    def predict_entities(self, doc: MutableDocument,
+                         ents: list[MutableEntity] | None = None
+                         ) -> list[MutableEntity]:
+        """Predict the relevant entities for the document.
+
+        This is meant to be used for the NER or the Linker component.
+        The idea is that this is the specific implementation only really
+        needs to implement this method for inference to work.
+
+        Args:
+            doc (MutableDocument): The document.
+            ents (list[MutableEntity] | None, optional): The entities to
+                consider (if any). If None, all possible entities in the
+                document are considered. Defaults to None.
+
+        Returns:
+            list[MutableEntity]: The predicted entities in document.
+        """
+        pass
+
+    def __call__(self, doc: MutableDocument) -> MutableDocument:
+        in_ents = self.get_ents_in(doc)
+        out_ents = self.predict_entities(doc, in_ents)
+        self.set_ents(doc, out_ents)
+        return doc
+
+
 @runtime_checkable
 class HashableComponet(Protocol):
 

From fc5ee2d6b7d4b734ca4f16c4f8d59ac9e8969736 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 17:08:07 +0000
Subject: [PATCH 02/23] CU-869b44wz8: Use new abstraction for linkers

---
 .../linking/context_based_linker.py           | 18 +++++++++-------
 .../components/linking/embedding_linker.py    | 21 ++++++++-----------
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/medcat-v2/medcat/components/linking/context_based_linker.py b/medcat-v2/medcat/components/linking/context_based_linker.py
index 860258bb5..f35aade52 100644
--- a/medcat-v2/medcat/components/linking/context_based_linker.py
+++ b/medcat-v2/medcat/components/linking/context_based_linker.py
@@ -2,7 +2,8 @@
 import logging
 from typing import Iterator, Optional, Union
 
-from medcat.components.types import CoreComponentType, AbstractCoreComponent
+from medcat.components.types import CoreComponentType
+from medcat.components.types import AbstractEntityProvidingComponent
 from medcat.tokenizing.tokens import MutableEntity, MutableDocument
 from medcat.components.linking.vector_context_model import (
     ContextModel, PerDocumentTokenCache)
@@ -18,7 +19,7 @@
 
 
 # class Linker(PipeRunner):
-class Linker(AbstractCoreComponent):
+class Linker(AbstractEntityProvidingComponent):
     """Link to a biomedical database.
 
     Args:
@@ -32,6 +33,7 @@ class Linker(AbstractCoreComponent):
 
     # Override
     def __init__(self, cdb: CDB, vocab: Vocab, config: Config) -> None:
+        super().__init__()
         self.cdb = cdb
         self.vocab = vocab
         self.config = config
@@ -193,7 +195,9 @@ def _inference(self, doc: MutableDocument) -> Iterator[MutableEntity]:
             yield from self._process_entity_inference(
                 doc, entity, per_doc_valid_token_cache)
 
-    def __call__(self, doc: MutableDocument) -> MutableDocument:
+    def predict_entities(self, doc: MutableDocument,
+                         ents: list[MutableEntity] | None = None
+                         ) -> list[MutableEntity]:
         # Reset main entities, will be recreated later
         doc.linked_ents.clear()
         cnf_l = self.config.components.linking
@@ -206,15 +210,15 @@ def __call__(self, doc: MutableDocument) -> MutableDocument:
         # cleared afterwards otherwise
         le = list(linked_entities)
 
-        doc.ner_ents.clear()
-        doc.ner_ents.extend(le)
-        create_main_ann(doc, self.config.general.show_nested_entities)
+        # doc.ner_ents.clear()
+        # doc.ner_ents.extend(le)
 
         # TODO - reintroduce pretty labels? and apply here?
 
         # TODO - reintroduce groups? and map here?
 
-        return doc
+        return create_main_ann(
+            doc, le, self.config.general.show_nested_entities)
 
     def train(self, cui: str,
               entity: MutableEntity,
diff --git a/medcat-v2/medcat/components/linking/embedding_linker.py b/medcat-v2/medcat/components/linking/embedding_linker.py
index cd72e7652..3f2791b93 100644
--- a/medcat-v2/medcat/components/linking/embedding_linker.py
+++ b/medcat-v2/medcat/components/linking/embedding_linker.py
@@ -1,6 +1,7 @@
 from medcat.cdb import CDB
 from medcat.config.config import Config, ComponentConfig, EmbeddingLinking
-from medcat.components.types import CoreComponentType, AbstractCoreComponent
+from medcat.components.types import CoreComponentType
+from medcat.components.types import AbstractEntityProvidingComponent
 from medcat.tokenizing.tokens import MutableEntity, MutableDocument
 from medcat.tokenizing.tokenizers import BaseTokenizer
 from typing import Optional, Iterator, Set
@@ -27,7 +28,7 @@
 logger = logging.getLogger(__name__)
 
 
-class Linker(AbstractCoreComponent):
+class Linker(AbstractEntityProvidingComponent):
     name = "embedding_linker"
 
     def __init__(self, cdb: CDB, config: Config) -> None:
@@ -36,6 +37,7 @@ def __init__(self, cdb: CDB, config: Config) -> None:
             cdb (CDB): The concept database to use.
             config (Config): The base config.
         """
+        super().__init__()
         self.cdb = cdb
         self.config = config
         if not isinstance(config.components.linking, EmbeddingLinking):
@@ -92,7 +94,7 @@ def create_embeddings(self,
         using the chosen embedding model."""
         if embedding_model_name is None:
             embedding_model_name = self.cnf_l.embedding_model_name  # fallback
-            
+
         if max_length is not None and max_length != self.max_length:
             logger.info(
             "Updating max_length from %s to %s", self.max_length, max_length
@@ -548,10 +550,9 @@ def _pre_inference(self, doc: MutableDocument) -> tuple[list, list]:
             to_infer.append(entity)
         return le, to_infer
 
-    def __call__(self, doc: MutableDocument) -> MutableDocument:
-        # Reset main entities, will be recreated later
-        doc.linked_ents.clear()
-
+    def predict_entities(self, doc: MutableDocument,
+                         ents: list[MutableEntity] | None = None
+                         ) -> list[MutableEntity]:
         if self.cdb.is_dirty:
             logging.warning(
                 "CDB has been modified since last save/load. "
@@ -580,11 +581,7 @@ def __call__(self, doc: MutableDocument) -> MutableDocument:
             for entities in self._batch_data(to_infer, self.cnf_l.linking_batch_size):
                 le.extend(list(self._inference(doc, entities)))
 
-        doc.ner_ents.clear()
-        doc.ner_ents.extend(le)
-        create_main_ann(doc)
-
-        return doc
+        return create_main_ann(doc, le)
 
     @property
     def names_context_matrix(self):

From 03b3eade74588db680e4e5479db2a3fa614ffd6b Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 17:08:50 +0000
Subject: [PATCH 03/23] CU-869b44wz8: Use new abstraaction for DeID

---
 .../components/ner/trf/transformers_ner.py    | 50 +++++++++++--------
 1 file changed, 29 insertions(+), 21 deletions(-)

diff --git a/medcat-v2/medcat/components/ner/trf/transformers_ner.py b/medcat-v2/medcat/components/ner/trf/transformers_ner.py
index 459ca88b2..c919a32e1 100644
--- a/medcat-v2/medcat/components/ner/trf/transformers_ner.py
+++ b/medcat-v2/medcat/components/ner/trf/transformers_ner.py
@@ -26,7 +26,8 @@
     serialise, AvailableSerialisers, deserialise)
 from medcat.storage.serialisables import SerialisingStrategy
 from medcat.preprocessors.cleaners import NameDescriptor
-from medcat.components.types import CoreComponentType, AbstractCoreComponent
+from medcat.components.types import CoreComponentType
+from medcat.components.types import AbstractEntityProvidingComponent
 from medcat.vocab import Vocab
 from medcat.utils.defaults import COMPONENTS_FOLDER
 
@@ -44,7 +45,7 @@
 logger = logging.getLogger(__name__)
 
 
-class TransformersNER(AbstractCoreComponent):
+class TransformersNER(AbstractEntityProvidingComponent):
     name = 'transformers_ner'
     _def_serialiser = AvailableSerialisers.dill
 
@@ -53,6 +54,7 @@ def __init__(self, cdb: CDB,
                  component: 'TransformersNERComponent',
                  config: Optional[ConfigTransformersNER] = None,
                  training_arguments=None,) -> None:
+        super().__init__(write_to_linked_ents=True)
         self._component = component
 
     @classmethod
@@ -106,8 +108,13 @@ def save(self, folder: str, overwrite: bool = False) -> None:
                         folder, serialiser=self._def_serialiser,
                         overwrite=overwrite)
 
-    def __call__(self, doc: MutableDocument) -> MutableDocument:
-        return self._component(doc)
+    def predict_entities(self, doc: MutableDocument,
+                         ents: list[MutableEntity] | None = None
+                         ) -> list[MutableEntity]:
+        if ents:
+            raise ValueError(
+                "This method should ne be called with pre-defined entities")
+        return self._component(doc)[1]
 
     # for manual serialisability
 
@@ -687,7 +694,8 @@ def batch_generator(stream: Iterable[MutableDocument],
             yield docs
 
     def pipe(self, stream: Iterable[Union[MutableDocument, None]],
-             *args, **kwargs) -> Iterator[MutableDocument]:
+             *args, **kwargs) -> Iterator[tuple[MutableDocument,
+                                                list[MutableEntity]]]:
         """Process many documents at once.
 
         Args:
@@ -700,7 +708,8 @@ def pipe(self, stream: Iterable[Union[MutableDocument, None]],
             Doc: The same document.
 
         Returns:
-            Iterator[MutableDocument]: If the stream is None or empty.
+            Iterator[tuple[MutableDocument, list[MutableEntity]]]: The stream
+                of documents and entities
         """
         # Just in case
         if stream is None or not stream:
@@ -710,11 +719,11 @@ def pipe(self, stream: Iterable[Union[MutableDocument, None]],
         batch_size_chars = self.config.general.pipe_batch_size_in_chars
         yield from self._process(stream, batch_size_chars)  # type: ignore
 
-    def _process_doc(self, doc: MutableDocument):
+    def _process_doc(self, doc: MutableDocument) -> list[MutableEntity]:
         aggr_strat = self.config.general.ner_aggregation_strategy
         res = self.ner_pipe(doc.base.text,
                             aggregation_strategy=aggr_strat)
-        doc.ner_ents = []  # type: ignore
+        ents: list[MutableEntity] = []
         for r in res:
             inds = []
             for ind, word in enumerate(doc):
@@ -732,15 +741,16 @@ def _process_doc(self, doc: MutableDocument):
                 label=r['entity_group'])
             entity.cui = r['entity_group']
             entity.context_similarity = r['score']
-            entity.id = len(doc.ner_ents)
+            entity.id = len(ents)
             entity.confidence = r['score']
 
-            doc.ner_ents.append(entity)
-        create_main_ann(doc)
+            ents.append(entity)
+        return create_main_ann(doc, ents)
 
     def _process(self,
                  stream: Iterable[Union[MutableDocument, None]],
-                 batch_size_chars: int) -> Iterator[Optional[MutableDocument]]:
+                 batch_size_chars: int) -> Iterator[
+                     tuple[MutableDocument, list[MutableEntity]]]:
         if not hasattr(self, "ner_pipe"):
             self.create_eval_pipeline()
         for docs in self.batch_generator(
@@ -748,11 +758,12 @@ def _process(self,
             # For now we will process the documents one by one, should be
             # improved in the future to use batching
             for doc in docs:
-                self._process_doc(doc)
-            yield from docs
+                ents = self._process_doc(doc)
+                yield doc, ents
 
     # Override
-    def __call__(self, doc: MutableDocument) -> MutableDocument:
+    def __call__(self, doc: MutableDocument,
+                 ) -> tuple[MutableDocument, list[MutableEntity]]:
         """Process one document, used in the spacy pipeline for sequential
         document processing.
 
@@ -761,13 +772,10 @@ def __call__(self, doc: MutableDocument) -> MutableDocument:
                 A spacy document
 
         Returns:
-            Doc: The same spacy document.
+            tuple[MutableDocument, list[MutableEntity]]: The document and
+                the corresponding entities.
         """
-
-        # Just call the pipe method
-        doc = next(self.pipe(iter([doc])))
-
-        return doc
+        return next(self.pipe(iter([doc])))
 
 
 # NOTE: Only needed for datasets backwards compatibility

From 15a5ad86ef00f803f078147855a98039cb5ee177 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 17:09:26 +0000
Subject: [PATCH 04/23] CU-869b44wz8: Fix setting of linker entities - do it
 all in one place

---
 medcat-v2/medcat/utils/postprocessing.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/medcat-v2/medcat/utils/postprocessing.py b/medcat-v2/medcat/utils/postprocessing.py
index 3e313825f..dfd2582d4 100644
--- a/medcat-v2/medcat/utils/postprocessing.py
+++ b/medcat-v2/medcat/utils/postprocessing.py
@@ -4,26 +4,33 @@
 # NOTE: the following used (in medcat v1) check tuis
 #       but they were never passed to the method so
 #       I've omitted it now
-def create_main_ann(doc: MutableDocument, show_nested_entities: bool = False) -> None:
+def create_main_ann(doc: MutableDocument,
+                    linked_ents: list[MutableEntity],
+                    show_nested_entities: bool = False
+                    ) -> list[MutableEntity]:
     """Creates annotation in the spacy ents list
     from all the annotations for this document.
 
     Args:
         doc (Doc): Spacy document.
+        linked_ents (list[MutableEntity]): The linked entities.
         show_nested_entities (bool): Whether to keep overlapping/nested entities.
             If True, keeps all entities. If False, filters overlapping entities
             keeping only the longest matches. Defaults to False.
+
+    Returns:
+        list[MutbaleEntity]: The resulting entities
     """
     if show_nested_entities:
-        doc.linked_ents = sorted(list(doc.linked_ents) + doc.ner_ents,  # type: ignore
-                                 key=lambda ent: ent.base.start_char_index)
+        return sorted(list(linked_ents),
+                      key=lambda ent: ent.base.start_char_index)
     else:
         # Filter overlapping entities using token indices (not object identity)
-        doc.ner_ents.sort(key=lambda x: len(x.base.text), reverse=True)
+        linked_ents.sort(key=lambda x: len(x.base.text), reverse=True)
         tkns_in = set()  # Set of token indices
         main_anns: list[MutableEntity] = []
 
-        for ent in doc.ner_ents:
+        for ent in linked_ents:
             to_add = True
             for tkn in ent:
                 if tkn.base.index in tkns_in:  # Use token index instead
@@ -34,7 +41,5 @@ def create_main_ann(doc: MutableDocument, show_nested_entities: bool = False) ->
                     tkns_in.add(tkn.base.index)
                 main_anns.append(ent)
 
-        # unclear why the original doc.linked_ents needs to be preserved here.
-        doc.linked_ents = sorted(list(doc.linked_ents) + main_anns,  # type: ignore
-                                 key=lambda ent: ent.base.start_char_index)
-
+        return sorted(main_anns,
+                      key=lambda ent: ent.base.start_char_index)

From 8a91f9f5ae284af63983ea256e4912304e67b946 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 17:09:40 +0000
Subject: [PATCH 05/23] Fix NER tests

---
 medcat-v2/tests/utils/ner/test_deid.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/medcat-v2/tests/utils/ner/test_deid.py b/medcat-v2/tests/utils/ner/test_deid.py
index df684d9de..35289083e 100644
--- a/medcat-v2/tests/utils/ner/test_deid.py
+++ b/medcat-v2/tests/utils/ner/test_deid.py
@@ -215,7 +215,7 @@ def test_model_works_deid_text(self):
     def test_model_works_dunder_call(self):
         anon_doc = self.deid_model(input_text)
         self.assertIsInstance(anon_doc, runtime_checkable(MutableDocument))
-        self.assertTrue(anon_doc.ner_ents)
+        self.assertTrue(anon_doc.linked_ents)
 
     def test_model_works_deid_text_redact(self):
         anon_text = self.deid_model.deid_text(input_text, redact=True)

From 7bc4dd54013cbc1a70c3597a446ad2c9b90f5746 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 17:09:58 +0000
Subject: [PATCH 06/23] Fix postporcesing tests

---
 medcat-v2/tests/utils/test_postprocessing.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/medcat-v2/tests/utils/test_postprocessing.py b/medcat-v2/tests/utils/test_postprocessing.py
index 12b56ed9f..f99dbfe05 100644
--- a/medcat-v2/tests/utils/test_postprocessing.py
+++ b/medcat-v2/tests/utils/test_postprocessing.py
@@ -3,6 +3,8 @@
 from typing import List
 
 from medcat.utils.postprocessing import create_main_ann
+from medcat.components.types import AbstractEntityProvidingComponent
+
 
 def create_mock_entity(text: str, start_char: int, end_char: int, cui: str = None, tokens: List = None):
     """Helper function to create a mock entity with minimal setup."""
@@ -60,7 +62,8 @@ def test_show_nested_entities_false_should_filter_overlaps(self):
 
         self.doc.ner_ents = [self.entity_chest_pain, self.entity_chest, self.entity_pain]
 
-        create_main_ann(self.doc, show_nested_entities=False)
+        AbstractEntityProvidingComponent.set_linked_ents(
+            self.doc, create_main_ann(self.doc, self.doc.ner_ents, show_nested_entities=False))
 
         entity_texts = [ent.base.text for ent in self.doc.linked_ents]
 
@@ -75,7 +78,8 @@ def test_show_nested_entities_true_should_keep_overlaps(self):
 
         self.doc.ner_ents = [self.entity_chest_pain, self.entity_chest, self.entity_pain]
 
-        create_main_ann(self.doc, show_nested_entities=True)
+        AbstractEntityProvidingComponent.set_linked_ents(
+            self.doc, create_main_ann(self.doc, self.doc.ner_ents, show_nested_entities=True))
 
         entity_texts = [ent.base.text for ent in self.doc.linked_ents]
 
@@ -96,7 +100,8 @@ def test_non_overlapping_entities_always_kept(self):
         self.doc.ner_ents = [self.entity_chest_pain, entity_dm]
 
         # Test with show_nested_entities=False
-        create_main_ann(self.doc, show_nested_entities=False)
+        AbstractEntityProvidingComponent.set_linked_ents(
+            self.doc, create_main_ann(self.doc, self.doc.ner_ents, show_nested_entities=False))
 
         entity_texts = [ent.base.text for ent in self.doc.linked_ents]
 
@@ -130,7 +135,8 @@ def test_same_concept_multiple_locations(self):
         # Test with show_nested_entities=False
         self.doc.ner_ents = [entity_chest_pain_1, entity_chest_pain_2, entity_chest_1, entity_pain_1_overlap]
 
-        create_main_ann(self.doc, show_nested_entities=False)
+        AbstractEntityProvidingComponent.set_linked_ents(
+            self.doc, create_main_ann(self.doc, self.doc.ner_ents, show_nested_entities=False))
 
         entity_texts = [ent.base.text for ent in self.doc.linked_ents]
         entity_positions = [(ent.base.text, ent.base.start_char_index, ent.base.end_char_index)
@@ -170,7 +176,8 @@ def test_same_concept_multiple_locations_with_nested_true(self):
         # Test with show_nested_entities=True
         self.doc.ner_ents = [entity_chest_pain_1, entity_chest_pain_2, entity_chest_1, entity_pain_1_overlap]
 
-        create_main_ann(self.doc, show_nested_entities=True)
+        AbstractEntityProvidingComponent.set_linked_ents(
+            self.doc, create_main_ann(self.doc, self.doc.ner_ents, show_nested_entities=True))
 
         entity_texts = [ent.base.text for ent in self.doc.linked_ents]
 

From bcd7f18d8155ae800c805093cfc63875a89bee16 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 22:32:18 +0000
Subject: [PATCH 07/23] CU-869b44wz8: Update NER components with new
 abstraction

---
 .../medcat/components/ner/dict_based_ner.py   | 29 +++++++++-----
 .../components/ner/vocab_based_annotator.py   |  2 -
 .../medcat/components/ner/vocab_based_ner.py  | 38 +++++++++++++------
 3 files changed, 46 insertions(+), 23 deletions(-)

diff --git a/medcat-v2/medcat/components/ner/dict_based_ner.py b/medcat-v2/medcat/components/ner/dict_based_ner.py
index 463c62201..84289530f 100644
--- a/medcat-v2/medcat/components/ner/dict_based_ner.py
+++ b/medcat-v2/medcat/components/ner/dict_based_ner.py
@@ -1,8 +1,9 @@
 from typing import Optional
 
 import logging
-from medcat.tokenizing.tokens import MutableDocument
-from medcat.components.types import CoreComponentType, AbstractCoreComponent
+from medcat.tokenizing.tokens import MutableDocument, MutableEntity
+from medcat.components.types import CoreComponentType
+from medcat.components.types import AbstractEntityProvidingComponent
 from medcat.components.ner.vocab_based_annotator import maybe_annotate_name
 from medcat.utils.import_utils import ensure_optional_extras_installed
 from medcat.tokenizing.tokenizers import BaseTokenizer
@@ -24,7 +25,7 @@
 logger = logging.getLogger(__name__)
 
 
-class NER(AbstractCoreComponent):
+class NER(AbstractEntityProvidingComponent):
     name = 'cat_dict_ner'
 
     def __init__(self, tokenizer: BaseTokenizer,
@@ -60,7 +61,9 @@ def _rebuild_automaton(self):
     def get_type(self) -> CoreComponentType:
         return CoreComponentType.ner
 
-    def __call__(self, doc: MutableDocument) -> MutableDocument:
+    def predict_entities(self, doc: MutableDocument,
+                         ents: list[MutableEntity] | None = None
+                         ) -> list[MutableEntity]:
         """Detect candidates for concepts - linker will then be able
         to do the rest. It adds `entities` to the doc.entities and each
         entity can have the entity.link_candidates - that the linker
@@ -69,15 +72,20 @@ def __call__(self, doc: MutableDocument) -> MutableDocument:
         Args:
             doc (MutableDocument):
                 Spacy document to be annotated with named entities.
+            ents (list[MutableEntity] | None):
+                The entities given. This should be None.
 
         Returns:
-            doc (MutableDocument):
-                Spacy document with detected entities.
+            list[MutableEntity]:
+                The NER'ed entities.
         """
+        if ents is not None:
+            ValueError(f"Unexpected entities sent to NER: {ents}")
         if self.cdb.has_changed_names:
             self.cdb._reset_subnames()
             self._rebuild_automaton()
         text = doc.base.text.lower()
+        ner_ents: list[MutableEntity] = []
         for end_idx, raw_name in self.automaton.iter(text):
             start_idx = end_idx - len(raw_name) + 1
             cur_tokens = doc.get_tokens(start_idx, end_idx)
@@ -96,9 +104,12 @@ def __call__(self, doc: MutableDocument) -> MutableDocument:
                 continue
             preprocessed_name = raw_name.replace(
                 ' ', self.config.general.separator)
-            maybe_annotate_name(self.tokenizer, preprocessed_name, cur_tokens,
-                                doc, self.cdb, self.config)
-        return doc
+            ent = maybe_annotate_name(
+                self.tokenizer, preprocessed_name, cur_tokens,
+                doc, self.cdb, self.config)
+            if ent:
+                ner_ents.append(ent)
+        return ner_ents
 
     @classmethod
     def create_new_component(
diff --git a/medcat-v2/medcat/components/ner/vocab_based_annotator.py b/medcat-v2/medcat/components/ner/vocab_based_annotator.py
index 6e65651ad..d15d9ca2b 100644
--- a/medcat-v2/medcat/components/ner/vocab_based_annotator.py
+++ b/medcat-v2/medcat/components/ner/vocab_based_annotator.py
@@ -26,8 +26,6 @@ def annotate_name(tokenizer: BaseTokenizer, name: str,
     entity.link_candidates = list(cdb.name2info[name]['per_cui_status'])
     entity.id = len(doc.ner_ents)
     entity.confidence = -1  # This does not calculate confidence
-    # Append the entity to the document
-    doc.ner_ents.append(entity)
 
     # Not necessary, but why not
     logger.debug("NER detected an entity.\n\tDetected name: %s" +
diff --git a/medcat-v2/medcat/components/ner/vocab_based_ner.py b/medcat-v2/medcat/components/ner/vocab_based_ner.py
index afd12e41e..173f5ffc0 100644
--- a/medcat-v2/medcat/components/ner/vocab_based_ner.py
+++ b/medcat-v2/medcat/components/ner/vocab_based_ner.py
@@ -1,7 +1,7 @@
 from typing import Optional
 
 import logging
-from medcat.tokenizing.tokens import MutableDocument
+from medcat.tokenizing.tokens import MutableDocument, MutableEntity
 from medcat.components.types import CoreComponentType, AbstractCoreComponent
 from medcat.components.ner.vocab_based_annotator import maybe_annotate_name
 from medcat.tokenizing.tokenizers import BaseTokenizer
@@ -18,6 +18,7 @@ class NER(AbstractCoreComponent):
 
     def __init__(self, tokenizer: BaseTokenizer,
                  cdb: CDB) -> None:
+        super().__init__()
         self.tokenizer = tokenizer
         self.cdb = cdb
         self.config = self.cdb.config
@@ -25,7 +26,9 @@ def __init__(self, tokenizer: BaseTokenizer,
     def get_type(self) -> CoreComponentType:
         return CoreComponentType.ner
 
-    def __call__(self, doc: MutableDocument) -> MutableDocument:
+    def predict_entities(self, doc: MutableDocument,
+                         ents: list[MutableEntity] | None = None
+                         ) -> list[MutableEntity]:
         """Detect candidates for concepts - linker will then be able
         to do the rest. It adds `entities` to the doc.entities and each
         entity can have the entity.link_candidates - that the linker
@@ -34,15 +37,18 @@ def __call__(self, doc: MutableDocument) -> MutableDocument:
         Args:
             doc (MutableDocument):
                 Spacy document to be annotated with named entities.
+            ents (list[MutableEntity] | None):
+                The entities given. This should be None.
 
         Returns:
-            doc (MutableDocument):
-                Spacy document with detected entities.
+            list[MutableEntity]:
+                The NER'ed entities.
         """
         max_skip_tokens = self.config.components.ner.max_skip_tokens
         _sep = self.config.general.separator
         # Just take the tokens we need
         _doc = [tkn for tkn in doc if not tkn.to_skip]
+        ner_ents: list[MutableEntity] = []
         for i, tkn in enumerate(_doc):
             tkn = _doc[i]
             tkns = [tkn]
@@ -60,8 +66,11 @@ def __call__(self, doc: MutableDocument) -> MutableDocument:
                     break
             # if name is in CDB
             if name in self.cdb.name2info and not tkn.base.is_stop:
-                maybe_annotate_name(self.tokenizer, name, tkns, doc,
-                                    self.cdb, self.config)
+                ent = maybe_annotate_name(
+                    self.tokenizer, name, tkns, doc,
+                    self.cdb, self.config)
+                if ent:
+                    ner_ents.append(ent)
             # if name is not a subname CDB (explicitly)
             if not name:
                 # There has to be at least something appended to the name
@@ -97,16 +106,21 @@ def __call__(self, doc: MutableDocument) -> MutableDocument:
 
                 if name_changed:
                     if name in self.cdb.name2info:
-                        maybe_annotate_name(self.tokenizer, name, tkns, doc,
-                                            self.cdb, self.config)
+                        ent = maybe_annotate_name(
+                            self.tokenizer, name, tkns, doc,
+                            self.cdb, self.config)
+                        if ent:
+                            ner_ents.append(ent)
                 elif name_reverse is not None:
                     if name_reverse in self.cdb.name2info:
-                        maybe_annotate_name(self.tokenizer, name_reverse, tkns,
-                                            doc, self.cdb, self.config)
+                        ent = maybe_annotate_name(
+                            self.tokenizer, name_reverse, tkns,
+                            doc, self.cdb, self.config)
+                        if ent:
+                            ner_ents.append(ent)
                 else:
                     break
-
-        return doc
+        return ner_ents
 
     @classmethod
     def create_new_component(

From 19f6db8d019e14efdc3cef1f992d9569217f3954 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 22:54:20 +0000
Subject: [PATCH 08/23] CU-869b44wz8: Fix issue with wrong base class

---
 medcat-v2/medcat/components/ner/vocab_based_ner.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/medcat-v2/medcat/components/ner/vocab_based_ner.py b/medcat-v2/medcat/components/ner/vocab_based_ner.py
index 173f5ffc0..f3f940d93 100644
--- a/medcat-v2/medcat/components/ner/vocab_based_ner.py
+++ b/medcat-v2/medcat/components/ner/vocab_based_ner.py
@@ -2,7 +2,8 @@
 
 import logging
 from medcat.tokenizing.tokens import MutableDocument, MutableEntity
-from medcat.components.types import CoreComponentType, AbstractCoreComponent
+from medcat.components.types import CoreComponentType
+from medcat.components.types import AbstractEntityProvidingComponent
 from medcat.components.ner.vocab_based_annotator import maybe_annotate_name
 from medcat.tokenizing.tokenizers import BaseTokenizer
 from medcat.vocab import Vocab
@@ -13,7 +14,7 @@
 logger = logging.getLogger(__name__)
 
 
-class NER(AbstractCoreComponent):
+class NER(AbstractEntityProvidingComponent):
     name = 'cat_ner'
 
     def __init__(self, tokenizer: BaseTokenizer,

From 6d0612c5f4671456c0d1c2cfb786d8d9db6729f7 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 23:00:34 +0000
Subject: [PATCH 09/23] CU-869b44wz8: Add missing base class init call

---
 medcat-v2/medcat/components/ner/dict_based_ner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/medcat-v2/medcat/components/ner/dict_based_ner.py b/medcat-v2/medcat/components/ner/dict_based_ner.py
index 84289530f..0eb15339a 100644
--- a/medcat-v2/medcat/components/ner/dict_based_ner.py
+++ b/medcat-v2/medcat/components/ner/dict_based_ner.py
@@ -30,6 +30,7 @@ class NER(AbstractEntityProvidingComponent):
 
     def __init__(self, tokenizer: BaseTokenizer,
                  cdb: CDB) -> None:
+        self.super().__init__()
         self.tokenizer = tokenizer
         self.cdb = cdb
         self.config = self.cdb.config

From 3747dcac0c6545a71e1099f32ab30e7eea5a1189 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 23:53:05 +0000
Subject: [PATCH 10/23] CU-869b44wz8: Fix typo

---
 medcat-v2/medcat/components/ner/dict_based_ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/medcat-v2/medcat/components/ner/dict_based_ner.py b/medcat-v2/medcat/components/ner/dict_based_ner.py
index 0eb15339a..9b84c6f8a 100644
--- a/medcat-v2/medcat/components/ner/dict_based_ner.py
+++ b/medcat-v2/medcat/components/ner/dict_based_ner.py
@@ -30,7 +30,7 @@ class NER(AbstractEntityProvidingComponent):
 
     def __init__(self, tokenizer: BaseTokenizer,
                  cdb: CDB) -> None:
-        self.super().__init__()
+        super().__init__()
         self.tokenizer = tokenizer
         self.cdb = cdb
         self.config = self.cdb.config

From a9fa26ae0c419ed6f7aa6c2071b1a2865fdb70b0 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 23:56:41 +0000
Subject: [PATCH 11/23] CU-869b44wz8: Avoid implicit use of doc.ner_ents

---
 .../linking/context_based_linker.py           | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/medcat-v2/medcat/components/linking/context_based_linker.py b/medcat-v2/medcat/components/linking/context_based_linker.py
index f35aade52..7a5778d52 100644
--- a/medcat-v2/medcat/components/linking/context_based_linker.py
+++ b/medcat-v2/medcat/components/linking/context_based_linker.py
@@ -107,9 +107,11 @@ def _process_entity_train(self, doc: MutableDocument,
                 entity.context_similarity = 1
                 yield entity
 
-    def _train_on_doc(self, doc: MutableDocument) -> Iterator[MutableEntity]:
+    def _train_on_doc(self, doc: MutableDocument,
+                      ner_ents: list[MutableEntity]
+                      ) -> Iterator[MutableEntity]:
         # Run training
-        for entity in doc.ner_ents:
+        for entity in ner_ents:
             yield from self._process_entity_train(
                 doc, entity, PerDocumentTokenCache())
 
@@ -188,9 +190,11 @@ def _process_entity_inference(
             entity.context_similarity = context_similarity
             yield entity
 
-    def _inference(self, doc: MutableDocument) -> Iterator[MutableEntity]:
+    def _inference(self, doc: MutableDocument,
+                   ner_ents: list[MutableEntity]
+                   ) -> Iterator[MutableEntity]:
         per_doc_valid_token_cache = PerDocumentTokenCache()
-        for entity in doc.ner_ents:
+        for entity in ner_ents:
             logger.debug("Linker started with entity: %s", entity.base.text)
             yield from self._process_entity_inference(
                 doc, entity, per_doc_valid_token_cache)
@@ -199,13 +203,15 @@ def predict_entities(self, doc: MutableDocument,
                          ents: list[MutableEntity] | None = None
                          ) -> list[MutableEntity]:
         # Reset main entities, will be recreated later
-        doc.linked_ents.clear()
         cnf_l = self.config.components.linking
 
+        if ents is None:
+            raise ValueError("Need to have NER'ed entities provided")
+
         if cnf_l.train:
-            linked_entities = self._train_on_doc(doc)
+            linked_entities = self._train_on_doc(doc, ents)
         else:
-            linked_entities = self._inference(doc)
+            linked_entities = self._inference(doc, ents)
         # evaluating generator here because the `all_ents` list gets
         # cleared afterwards otherwise
         le = list(linked_entities)

From c4583e0c33709683896b1d0be2ad2e9ddefbfdab Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Mon, 10 Nov 2025 23:59:43 +0000
Subject: [PATCH 12/23] CU-869b44wz8: Fix issue with entity IDs

---
 medcat-v2/medcat/components/ner/dict_based_ner.py        | 2 +-
 medcat-v2/medcat/components/ner/vocab_based_annotator.py | 7 +++++--
 medcat-v2/medcat/components/ner/vocab_based_ner.py       | 6 +++---
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/medcat-v2/medcat/components/ner/dict_based_ner.py b/medcat-v2/medcat/components/ner/dict_based_ner.py
index 9b84c6f8a..eefec3070 100644
--- a/medcat-v2/medcat/components/ner/dict_based_ner.py
+++ b/medcat-v2/medcat/components/ner/dict_based_ner.py
@@ -107,7 +107,7 @@ def predict_entities(self, doc: MutableDocument,
                 ' ', self.config.general.separator)
             ent = maybe_annotate_name(
                 self.tokenizer, preprocessed_name, cur_tokens,
-                doc, self.cdb, self.config)
+                doc, self.cdb, self.config, len(ner_ents))
             if ent:
                 ner_ents.append(ent)
         return ner_ents
diff --git a/medcat-v2/medcat/components/ner/vocab_based_annotator.py b/medcat-v2/medcat/components/ner/vocab_based_annotator.py
index d15d9ca2b..4478a3432 100644
--- a/medcat-v2/medcat/components/ner/vocab_based_annotator.py
+++ b/medcat-v2/medcat/components/ner/vocab_based_annotator.py
@@ -16,6 +16,7 @@
 def annotate_name(tokenizer: BaseTokenizer, name: str,
                   tkns: list[MutableToken],
                   doc: MutableDocument, cdb: CDB,
+                  cur_id: int,
                   label: str):
     entity: MutableEntity = tokenizer.create_entity(
         doc, tkns[0].base.index, tkns[-1].base.index + 1, label=label)
@@ -24,7 +25,7 @@ def annotate_name(tokenizer: BaseTokenizer, name: str,
     # All standard name entity recognition models will not set this.
     entity.detected_name = name
     entity.link_candidates = list(cdb.name2info[name]['per_cui_status'])
-    entity.id = len(doc.ner_ents)
+    entity.id = cur_id
     entity.confidence = -1  # This does not calculate confidence
 
     # Not necessary, but why not
@@ -37,6 +38,7 @@ def annotate_name(tokenizer: BaseTokenizer, name: str,
 def maybe_annotate_name(tokenizer: BaseTokenizer, name: str,
                         tkns: list[MutableToken],
                         doc: MutableDocument, cdb: CDB, config: Config,
+                        cur_id: int,
                         label: str = 'concept'
                         ) -> Optional[MutableEntity]:
     """Given a name it will check should it be annotated based on config rules.
@@ -83,6 +85,7 @@ def maybe_annotate_name(tokenizer: BaseTokenizer, name: str,
         if (len(name) >= config.components.ner.upper_case_limit_len or
                 (len(tkns) == 1 and tkns[0].base.is_upper)):
             # Everything is fine, mark name
-            return annotate_name(tokenizer, name, tkns, doc, cdb, label)
+            return annotate_name(
+                tokenizer, name, tkns, doc, cdb, cur_id, label)
 
     return None
diff --git a/medcat-v2/medcat/components/ner/vocab_based_ner.py b/medcat-v2/medcat/components/ner/vocab_based_ner.py
index f3f940d93..d214714a0 100644
--- a/medcat-v2/medcat/components/ner/vocab_based_ner.py
+++ b/medcat-v2/medcat/components/ner/vocab_based_ner.py
@@ -69,7 +69,7 @@ def predict_entities(self, doc: MutableDocument,
             if name in self.cdb.name2info and not tkn.base.is_stop:
                 ent = maybe_annotate_name(
                     self.tokenizer, name, tkns, doc,
-                    self.cdb, self.config)
+                    self.cdb, self.config, len(ner_ents))
                 if ent:
                     ner_ents.append(ent)
             # if name is not a subname CDB (explicitly)
@@ -109,14 +109,14 @@ def predict_entities(self, doc: MutableDocument,
                     if name in self.cdb.name2info:
                         ent = maybe_annotate_name(
                             self.tokenizer, name, tkns, doc,
-                            self.cdb, self.config)
+                            self.cdb, self.config, len(ner_ents))
                         if ent:
                             ner_ents.append(ent)
                 elif name_reverse is not None:
                     if name_reverse in self.cdb.name2info:
                         ent = maybe_annotate_name(
                             self.tokenizer, name_reverse, tkns,
-                            doc, self.cdb, self.config)
+                            doc, self.cdb, self.config, len(ner_ents))
                         if ent:
                             ner_ents.append(ent)
                 else:

From 4926d432627d55292c9f5789912cc0db93a764e5 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 11 Nov 2025 10:07:26 +0000
Subject: [PATCH 13/23] Update tutorial with up to date example

---
 .../advanced/2._Create_and_use_component.ipynb     | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb b/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb
index 24a2bb7c3..da26cbe57 100644
--- a/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb
+++ b/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb
@@ -73,6 +73,7 @@
     "    # NOTE: NEED TO IMPLEMENT\n",
     "    # you can specify whatever init args as long as you define them above\n",
     "    def __init__(self, tokenizer: BaseTokenizer, cdb: CDB):\n",
+    "        super().__init__()\n",
     "        self.tokenizer = tokenizer\n",
     "        self.cdb = cdb\n",
     "\n",
@@ -90,7 +91,9 @@
     "        return CoreComponentType.ner\n",
     "\n",
     "    # NOTE: NEED TO IMPLEMENT\n",
-    "    def __call__(self, doc: MutableDocument) -> MutableDocument:\n",
+    "    def predict_entities(self, doc: MutableDocument,\n",
+    "                         ents: list[MutableEntity] | None = None\n",
+    "                         ) -> list[MutableEntity]:\n",
     "        \"\"\"Detect candidates for concepts - linker will then be able\n",
     "        to do the rest. It adds `entities` to the doc.entities and each\n",
     "        entity can have the entity.link_candidates - that the linker\n",
@@ -99,6 +102,8 @@
     "        Args:\n",
     "            doc (MutableDocument):\n",
     "                Spacy document to be annotated with named entities.\n",
+    "            ents list[MutableEntity] | None = None:\n",
+    "                The entties to use. None expected here.\n",
     "\n",
     "        Returns:\n",
     "            doc (MutableDocument):\n",
@@ -113,6 +118,7 @@
     "                           for start in start_tkn_indices]\n",
     "        choose_from = list(self.cdb.name2info.keys())\n",
     "        chosen_name = [random.choice(choose_from) for _ in start_tkn_indices]\n",
+    "        ner_ents: list[MutableEntity] = []\n",
     "        for tkn_start_idx, tkn_end_idx, linked_name in zip(start_tkn_indices, end_tkn_indices, chosen_name):\n",
     "            char_start_idx = doc[tkn_start_idx].base.char_index\n",
     "            # NOTE: can only do this since we're never selecting the last token\n",
@@ -123,8 +129,10 @@
     "            #       safe to assume that these are all lists of tokens\n",
     "\n",
     "            # this checks the config (i.e length and stuff) and then annotes\n",
-    "            maybe_annotate_name(self.tokenizer, linked_name, cur_tokens, doc, self.cdb, self.cdb.config)\n",
-    "        return doc\n",
+    "            ent = maybe_annotate_name(self.tokenizer, linked_name, cur_tokens, doc, self.cdb, self.cdb.config)\n",
+    "            if ent:\n",
+    "                ner_ents.append(ent)\n",
+    "        return ner_ents\n",
     "\n"
    ]
   },

From 7e202a48b63f81237310e57edc2e80ecd691ced0 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 11 Nov 2025 10:11:33 +0000
Subject: [PATCH 14/23] CU-869b44wz8: Fix issue with wrong base class in
 tutorial

---
 .../notebooks/advanced/2._Create_and_use_component.ipynb     | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb b/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb
index da26cbe57..cd1471187 100644
--- a/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb
+++ b/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb
@@ -38,7 +38,8 @@
     "from medcat.cdb.cdb import CDB\n",
     "from medcat.config.config import Ner\n",
     "# for the component itself\n",
-    "from medcat.components.types import AbstractCoreComponent, CoreComponentType\n",
+    "from medcat.components.types import CoreComponentType\n",
+    "from medcat.components.types import AbstractEntityProvidingComponent\n",
     "from medcat.tokenizing.tokens import MutableDocument, MutableEntity\n",
     "from medcat.components.ner.vocab_based_annotator import maybe_annotate_name\n",
     "\n",
@@ -59,7 +60,7 @@
     "        return min(max(self.min, num), self.max)\n",
     "\n",
     "\n",
-    "class RandomNER(AbstractCoreComponent):\n",
+    "class RandomNER(AbstractEntityProvidingComponent):\n",
     "    # NOTE: NEED TO IMPLEMENT\n",
     "    name = \"RANDOM_NER\"\n",
     "\n",

From 0b0d6985b2cb1c85925fe8abc3c782d4852106db Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 11 Nov 2025 10:22:20 +0000
Subject: [PATCH 15/23] CU-869b44wz8: Reinstate old signature of
 create_main_ann and use new one

---
 .../components/linking/context_based_linker.py     |  4 ++--
 .../medcat/components/linking/embedding_linker.py  |  4 ++--
 .../medcat/components/ner/trf/transformers_ner.py  |  4 ++--
 medcat-v2/medcat/utils/postprocessing.py           | 14 ++++++++++----
 medcat-v2/tests/utils/test_postprocessing.py       | 12 ++++++------
 5 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/medcat-v2/medcat/components/linking/context_based_linker.py b/medcat-v2/medcat/components/linking/context_based_linker.py
index 7a5778d52..f171a931b 100644
--- a/medcat-v2/medcat/components/linking/context_based_linker.py
+++ b/medcat-v2/medcat/components/linking/context_based_linker.py
@@ -11,7 +11,7 @@
 from medcat.vocab import Vocab
 from medcat.config.config import Config, ComponentConfig
 from medcat.utils.defaults import StatusTypes as ST
-from medcat.utils.postprocessing import create_main_ann
+from medcat.utils.postprocessing import filter_linked_annotations
 from medcat.tokenizing.tokenizers import BaseTokenizer
 
 
@@ -223,7 +223,7 @@ def predict_entities(self, doc: MutableDocument,
 
         # TODO - reintroduce groups? and map here?
 
-        return create_main_ann(
+        return filter_linked_annotations(
             doc, le, self.config.general.show_nested_entities)
 
     def train(self, cui: str,
diff --git a/medcat-v2/medcat/components/linking/embedding_linker.py b/medcat-v2/medcat/components/linking/embedding_linker.py
index 3f2791b93..c13a48fb6 100644
--- a/medcat-v2/medcat/components/linking/embedding_linker.py
+++ b/medcat-v2/medcat/components/linking/embedding_linker.py
@@ -6,7 +6,7 @@
 from medcat.tokenizing.tokenizers import BaseTokenizer
 from typing import Optional, Iterator, Set
 from medcat.vocab import Vocab
-from medcat.utils.postprocessing import create_main_ann
+from medcat.utils.postprocessing import filter_linked_annotations
 from tqdm import tqdm
 from collections import defaultdict
 import logging
@@ -581,7 +581,7 @@ def predict_entities(self, doc: MutableDocument,
             for entities in self._batch_data(to_infer, self.cnf_l.linking_batch_size):
                 le.extend(list(self._inference(doc, entities)))
 
-        return create_main_ann(doc, le)
+        return filter_linked_annotations(doc, le)
 
     @property
     def names_context_matrix(self):
diff --git a/medcat-v2/medcat/components/ner/trf/transformers_ner.py b/medcat-v2/medcat/components/ner/trf/transformers_ner.py
index c919a32e1..513c50ec8 100644
--- a/medcat-v2/medcat/components/ner/trf/transformers_ner.py
+++ b/medcat-v2/medcat/components/ner/trf/transformers_ner.py
@@ -12,7 +12,7 @@
 from medcat.cdb.cdb import CDB
 from medcat.components.addons.meta_cat.ml_utils import set_all_seeds
 from medcat.utils.ner import transformers_ner
-from medcat.utils.postprocessing import create_main_ann
+from medcat.utils.postprocessing import filter_linked_annotations
 from medcat.utils.hasher import Hasher
 from medcat.config.config_transformers_ner import ConfigTransformersNER
 from medcat.config.config import ComponentConfig
@@ -745,7 +745,7 @@ def _process_doc(self, doc: MutableDocument) -> list[MutableEntity]:
             entity.confidence = r['score']
 
             ents.append(entity)
-        return create_main_ann(doc, ents)
+        return filter_linked_annotations(doc, ents)
 
     def _process(self,
                  stream: Iterable[Union[MutableDocument, None]],
diff --git a/medcat-v2/medcat/utils/postprocessing.py b/medcat-v2/medcat/utils/postprocessing.py
index dfd2582d4..0fd901015 100644
--- a/medcat-v2/medcat/utils/postprocessing.py
+++ b/medcat-v2/medcat/utils/postprocessing.py
@@ -1,13 +1,19 @@
 from medcat.tokenizing.tokenizers import MutableDocument, MutableEntity
 
 
+def create_main_ann(doc: MutableDocument, show_nested_entities: bool = False) -> None:
+    filter_linked_annotations(
+        doc, doc.linked_ents, show_nested_entities=show_nested_entities)
+
+
 # NOTE: the following used (in medcat v1) check tuis
 #       but they were never passed to the method so
 #       I've omitted it now
-def create_main_ann(doc: MutableDocument,
-                    linked_ents: list[MutableEntity],
-                    show_nested_entities: bool = False
-                    ) -> list[MutableEntity]:
+def filter_linked_annotations(
+        doc: MutableDocument,
+        linked_ents: list[MutableEntity],
+        show_nested_entities: bool = False
+        ) -> list[MutableEntity]:
     """Creates annotation in the spacy ents list
     from all the annotations for this document.
 
diff --git a/medcat-v2/tests/utils/test_postprocessing.py b/medcat-v2/tests/utils/test_postprocessing.py
index f99dbfe05..5fb77e0fe 100644
--- a/medcat-v2/tests/utils/test_postprocessing.py
+++ b/medcat-v2/tests/utils/test_postprocessing.py
@@ -2,7 +2,7 @@
 from unittest.mock import Mock, MagicMock
 from typing import List
 
-from medcat.utils.postprocessing import create_main_ann
+from medcat.utils.postprocessing import filter_linked_annotations
 from medcat.components.types import AbstractEntityProvidingComponent
 
 
@@ -63,7 +63,7 @@ def test_show_nested_entities_false_should_filter_overlaps(self):
         self.doc.ner_ents = [self.entity_chest_pain, self.entity_chest, self.entity_pain]
 
         AbstractEntityProvidingComponent.set_linked_ents(
-            self.doc, create_main_ann(self.doc, self.doc.ner_ents, show_nested_entities=False))
+            self.doc, filter_linked_annotations(self.doc, self.doc.ner_ents, show_nested_entities=False))
 
         entity_texts = [ent.base.text for ent in self.doc.linked_ents]
 
@@ -79,7 +79,7 @@ def test_show_nested_entities_true_should_keep_overlaps(self):
         self.doc.ner_ents = [self.entity_chest_pain, self.entity_chest, self.entity_pain]
 
         AbstractEntityProvidingComponent.set_linked_ents(
-            self.doc, create_main_ann(self.doc, self.doc.ner_ents, show_nested_entities=True))
+            self.doc, filter_linked_annotations(self.doc, self.doc.ner_ents, show_nested_entities=True))
 
         entity_texts = [ent.base.text for ent in self.doc.linked_ents]
 
@@ -101,7 +101,7 @@ def test_non_overlapping_entities_always_kept(self):
 
         # Test with show_nested_entities=False
         AbstractEntityProvidingComponent.set_linked_ents(
-            self.doc, create_main_ann(self.doc, self.doc.ner_ents, show_nested_entities=False))
+            self.doc, filter_linked_annotations(self.doc, self.doc.ner_ents, show_nested_entities=False))
 
         entity_texts = [ent.base.text for ent in self.doc.linked_ents]
 
@@ -136,7 +136,7 @@ def test_same_concept_multiple_locations(self):
         self.doc.ner_ents = [entity_chest_pain_1, entity_chest_pain_2, entity_chest_1, entity_pain_1_overlap]
 
         AbstractEntityProvidingComponent.set_linked_ents(
-            self.doc, create_main_ann(self.doc, self.doc.ner_ents, show_nested_entities=False))
+            self.doc, filter_linked_annotations(self.doc, self.doc.ner_ents, show_nested_entities=False))
 
         entity_texts = [ent.base.text for ent in self.doc.linked_ents]
         entity_positions = [(ent.base.text, ent.base.start_char_index, ent.base.end_char_index)
@@ -177,7 +177,7 @@ def test_same_concept_multiple_locations_with_nested_true(self):
         self.doc.ner_ents = [entity_chest_pain_1, entity_chest_pain_2, entity_chest_1, entity_pain_1_overlap]
 
         AbstractEntityProvidingComponent.set_linked_ents(
-            self.doc, create_main_ann(self.doc, self.doc.ner_ents, show_nested_entities=True))
+            self.doc, filter_linked_annotations(self.doc, self.doc.ner_ents, show_nested_entities=True))
 
         entity_texts = [ent.base.text for ent in self.doc.linked_ents]
 

From 4c4113b6a1f1983ecd9f42bd3bf483ded2a844f2 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 11 Nov 2025 10:25:02 +0000
Subject: [PATCH 16/23] CU-869b44wz8: Deprecate old create_main_ann method

---
 medcat-v2/medcat/utils/postprocessing.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/medcat-v2/medcat/utils/postprocessing.py b/medcat-v2/medcat/utils/postprocessing.py
index 0fd901015..78e90f2b7 100644
--- a/medcat-v2/medcat/utils/postprocessing.py
+++ b/medcat-v2/medcat/utils/postprocessing.py
@@ -1,7 +1,16 @@
+import warnings
+
 from medcat.tokenizing.tokenizers import MutableDocument, MutableEntity
 
 
 def create_main_ann(doc: MutableDocument, show_nested_entities: bool = False) -> None:
+    warnings.warn(
+        "The `medcat.utils.postprocessing.create_main_ann` method is"
+        "depreacated and subject to removal in a future release. Please "
+        "use `medcat.utils.postprocessing.filter_linked_annotations` instead.",
+        DeprecationWarning,
+        stacklevel=2
+    )
     filter_linked_annotations(
         doc, doc.linked_ents, show_nested_entities=show_nested_entities)
 

From 4a56bbd3dd0212b2dfde9f862e7a4055a3bc9f56 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 11 Nov 2025 10:44:19 +0000
Subject: [PATCH 17/23] CU-869b44wz8: Use correct syntax in tutorials for
 maybe_annotate_name

---
 .../notebooks/advanced/2._Create_and_use_component.ipynb        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb b/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb
index cd1471187..17f571c5f 100644
--- a/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb
+++ b/medcat-v2-tutorials/notebooks/advanced/2._Create_and_use_component.ipynb
@@ -130,7 +130,7 @@
     "            #       safe to assume that these are all lists of tokens\n",
     "\n",
     "            # this checks the config (i.e length and stuff) and then annotes\n",
-    "            ent = maybe_annotate_name(self.tokenizer, linked_name, cur_tokens, doc, self.cdb, self.cdb.config)\n",
+    "            ent = maybe_annotate_name(self.tokenizer, linked_name, cur_tokens, doc, self.cdb, self.cdb.config, len(ner_ents))\n",
     "            if ent:\n",
     "                ner_ents.append(ent)\n",
     "        return ner_ents\n",

From f7fe6e9d108e721cd0e5c3d907dab0d066bed701 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Tue, 11 Nov 2025 10:45:01 +0000
Subject: [PATCH 18/23] CU-869b44wz8: Allow None for current ID and produce a
 unique ID if needed

---
 .../components/ner/vocab_based_annotator.py   | 25 +++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/medcat-v2/medcat/components/ner/vocab_based_annotator.py b/medcat-v2/medcat/components/ner/vocab_based_annotator.py
index 4478a3432..06dc696f5 100644
--- a/medcat-v2/medcat/components/ner/vocab_based_annotator.py
+++ b/medcat-v2/medcat/components/ner/vocab_based_annotator.py
@@ -13,10 +13,13 @@
 logger = logging.getLogger(__name__)
 
 
+_START_INDEX_MULT = 1000
+
+
 def annotate_name(tokenizer: BaseTokenizer, name: str,
                   tkns: list[MutableToken],
                   doc: MutableDocument, cdb: CDB,
-                  cur_id: int,
+                  cur_id: int | None,
                   label: str):
     entity: MutableEntity = tokenizer.create_entity(
         doc, tkns[0].base.index, tkns[-1].base.index + 1, label=label)
@@ -25,6 +28,22 @@ def annotate_name(tokenizer: BaseTokenizer, name: str,
     # All standard name entity recognition models will not set this.
     entity.detected_name = name
     entity.link_candidates = list(cdb.name2info[name]['per_cui_status'])
+
+    if cur_id is None:
+        logger.warning(
+            "`medcat.components.ner.vocab_based_annotator.annotate_name` "
+            "was called with no `cur_id`. This behaviour is not fully "
+            "supported anymore.")
+        start_index = entity.base.start_char_index
+        span_len = len(name)
+        cur_id = start_index * _START_INDEX_MULT + span_len
+        # NOTE: These will be unique if the maximum length of each
+        #       entity does not exceed _START_INDEX_MULT (1000)
+        logger.warning(
+            "Using the text start index %d (multiplied by %d) and adding "
+            "the span length %d to get the id of %d", start_index,
+            _START_INDEX_MULT, span_len, cur_id)
+
     entity.id = cur_id
     entity.confidence = -1  # This does not calculate confidence
 
@@ -38,7 +57,7 @@ def annotate_name(tokenizer: BaseTokenizer, name: str,
 def maybe_annotate_name(tokenizer: BaseTokenizer, name: str,
                         tkns: list[MutableToken],
                         doc: MutableDocument, cdb: CDB, config: Config,
-                        cur_id: int,
+                        cur_id: int | None = None,
                         label: str = 'concept'
                         ) -> Optional[MutableEntity]:
     """Given a name it will check should it be annotated based on config rules.
@@ -57,6 +76,8 @@ def maybe_annotate_name(tokenizer: BaseTokenizer, name: str,
             Concept database.
         config (Config):
             Global config for medcat.
+        cur_id (int | None):
+            The potential ID for the entity. Defaults to None.
         label (str):
             Label for this name (usually `concept` if we are using
             a vocab based approach).

From dafc986bb523ddde2b32869c0c36146c2ee91efb Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 12 Nov 2025 10:28:02 +0000
Subject: [PATCH 19/23] CU-869b44wz8: Add entity to doc.ner_ents during
 annotate_name if no ID (i.e old API) is used to preserve previous
 functionality

---
 medcat-v2/medcat/components/ner/vocab_based_annotator.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/medcat-v2/medcat/components/ner/vocab_based_annotator.py b/medcat-v2/medcat/components/ner/vocab_based_annotator.py
index 06dc696f5..6e28e6306 100644
--- a/medcat-v2/medcat/components/ner/vocab_based_annotator.py
+++ b/medcat-v2/medcat/components/ner/vocab_based_annotator.py
@@ -43,6 +43,11 @@ def annotate_name(tokenizer: BaseTokenizer, name: str,
             "Using the text start index %d (multiplied by %d) and adding "
             "the span length %d to get the id of %d", start_index,
             _START_INDEX_MULT, span_len, cur_id)
+        logger.warning(
+            "Setting MutableDocument.ner_ents during the method "
+            "`medcat.components.ner.vocab_based_annotator.annotate_name` "
+            "because the old API (without an ID) was used")
+        doc.ner_ents.append(entity)  # TODO: remove this
 
     entity.id = cur_id
     entity.confidence = -1  # This does not calculate confidence

From 82577efdad7c3a5203744a632876b56b55535f56 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 12 Nov 2025 10:40:36 +0000
Subject: [PATCH 20/23] CU-869b44wz8: Add a few tests for old and new API for
 maybe_annnotate_name

---
 .../ner/test_vocab_based_annotator.py         | 41 +++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 medcat-v2/tests/components/ner/test_vocab_based_annotator.py

diff --git a/medcat-v2/tests/components/ner/test_vocab_based_annotator.py b/medcat-v2/tests/components/ner/test_vocab_based_annotator.py
new file mode 100644
index 000000000..877cd683a
--- /dev/null
+++ b/medcat-v2/tests/components/ner/test_vocab_based_annotator.py
@@ -0,0 +1,41 @@
+from collections import defaultdict
+
+from medcat.components.ner import vocab_based_annotator
+from medcat.tokenizing.tokenizers import create_tokenizer
+from medcat.config import Config
+
+import unittest
+import unittest.mock
+
+
+class MaybeAnnotateNameTests(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.cnf = Config()
+        cls.tokenizer = create_tokenizer("regex", cls.cnf)
+        cls.example_name = "some long name"
+        cls.tokens = list(cls.tokenizer(cls.example_name)[:])
+        cls.mock_cdb = unittest.mock.Mock()
+        cls.mock_cdb.name2info = defaultdict(lambda: defaultdict(lambda: "P"))
+
+    def setUp(self):
+        self.mock_doc = unittest.mock.Mock()
+        # self.mock_doc.ner_ents = unittest.mock.Mock()
+        self.mock_doc._tokens = self.tokens
+        self.mock_doc.ner_ents.append = unittest.mock.Mock()
+        self.mock_doc.ner_ents.__len__ = unittest.mock.Mock(return_value=0)
+
+    def test_old_API_has_side_effects(self):
+        vocab_based_annotator.maybe_annotate_name(
+            self.tokenizer, self.example_name,
+            tkns=self.tokens, doc=self.mock_doc, cdb=self.mock_cdb,
+            config=self.cnf)
+        self.mock_doc.ner_ents.append.assert_called_once()
+
+    def test_new_API_has_no_side_effects(self):
+        vocab_based_annotator.maybe_annotate_name(
+            self.tokenizer, self.example_name,
+            tkns=self.tokens, doc=self.mock_doc, cdb=self.mock_cdb,
+            config=self.cnf, cur_id=1)
+        self.mock_doc.ner_ents.append.assert_not_called()

From 0723230d2bb435400b54f6712bc8d4926987d849 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 12 Nov 2025 10:55:25 +0000
Subject: [PATCH 21/23] CU-869b44wz8: Fix old behaviour of create_main_ann

---
 medcat-v2/medcat/utils/postprocessing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/medcat-v2/medcat/utils/postprocessing.py b/medcat-v2/medcat/utils/postprocessing.py
index 78e90f2b7..817876b8b 100644
--- a/medcat-v2/medcat/utils/postprocessing.py
+++ b/medcat-v2/medcat/utils/postprocessing.py
@@ -11,8 +11,8 @@ def create_main_ann(doc: MutableDocument, show_nested_entities: bool = False) ->
         DeprecationWarning,
         stacklevel=2
     )
-    filter_linked_annotations(
-        doc, doc.linked_ents, show_nested_entities=show_nested_entities)
+    doc.linked_ents = filter_linked_annotations(  # type: ignore
+        doc, doc.ner_ents, show_nested_entities=show_nested_entities)
 
 
 # NOTE: the following used (in medcat v1) check tuis

From 5dd573940b698c1e9c7879fdb3eb1ac15fbe95d5 Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 12 Nov 2025 10:56:05 +0000
Subject: [PATCH 22/23] CU-869b44wz8: Add a few small tests fro create_main_ann
 and filter_linked_annotations

---
 medcat-v2/tests/utils/test_postprocessing.py | 26 +++++++++++++++++++-
 1 file changed, 25 insertions(+), 1 deletion(-)

diff --git a/medcat-v2/tests/utils/test_postprocessing.py b/medcat-v2/tests/utils/test_postprocessing.py
index 5fb77e0fe..8269fa168 100644
--- a/medcat-v2/tests/utils/test_postprocessing.py
+++ b/medcat-v2/tests/utils/test_postprocessing.py
@@ -1,8 +1,9 @@
 import unittest
+import unittest.mock
 from unittest.mock import Mock, MagicMock
 from typing import List
 
-from medcat.utils.postprocessing import filter_linked_annotations
+from medcat.utils.postprocessing import filter_linked_annotations, create_main_ann
 from medcat.components.types import AbstractEntityProvidingComponent
 
 
@@ -188,5 +189,28 @@ def test_same_concept_multiple_locations_with_nested_true(self):
         self.assertIn("pain", entity_texts, "Should keep overlapping 'pain' entity")
 
 
+class TestCreateMainAnn(unittest.TestCase):
+
+    def setUp(self):
+        # self.mock_doc = unittest.mock.Mock()
+        # self.mock_doc.linked_ents.__iter__ = unittest.mock.Mock(
+        #     return_value=iter([]))
+        self.mock_doc = create_mock_document(
+            f"{'st0':10s}{'st1':10s}{'st2':10s}{'st3':10s}")
+        # self.mock_doc.linked_ents.append = unittest.mock.Mock()
+        self.mock_entities = [create_mock_entity(
+            f"st{index}", index * 10, index * 10 + 3, cui="C1"
+        ) for index in range(4)]
+        self.mock_doc.ner_ents = self.mock_entities
+
+    def test_create_main_ann_has_side_effect(self):
+        create_main_ann(self.mock_doc)
+        self.assertGreaterEqual(len(self.mock_doc.linked_ents), 1)
+
+    def test_filter_linked_annotations_has_no_side_effect(self):
+        filter_linked_annotations(self.mock_doc, self.mock_entities)
+        self.assertEqual(len(self.mock_doc.linked_ents), 0)
+
+
 if __name__ == '__main__':
     unittest.main()
\ No newline at end of file

From 14fc9aeeab0abeb5ee86f3666514d00dca3ae3ab Mon Sep 17 00:00:00 2001
From: mart-r <mart.ratas@gmail.com>
Date: Wed, 12 Nov 2025 10:56:47 +0000
Subject: [PATCH 23/23] CU-869b44wz8: Add a baseline test

---
 medcat-v2/tests/utils/test_postprocessing.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/medcat-v2/tests/utils/test_postprocessing.py b/medcat-v2/tests/utils/test_postprocessing.py
index 8269fa168..42c1410c8 100644
--- a/medcat-v2/tests/utils/test_postprocessing.py
+++ b/medcat-v2/tests/utils/test_postprocessing.py
@@ -203,6 +203,9 @@ def setUp(self):
         ) for index in range(4)]
         self.mock_doc.ner_ents = self.mock_entities
 
+    def test_init_doc_has_no_linked_ents(self):
+        self.assertEqual(len(self.mock_doc.linked_ents), 0)
+
     def test_create_main_ann_has_side_effect(self):
         create_main_ann(self.mock_doc)
         self.assertGreaterEqual(len(self.mock_doc.linked_ents), 1)