From b53f1406d53de0f8bd74ab985c3224428b6862c9 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 7 Jul 2025 16:42:22 +0100 Subject: [PATCH 1/4] CU-8699py5m0: Fix ordering of text/index when batching on a per char length basis --- medcat-v2/medcat/cat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/medcat/cat.py b/medcat-v2/medcat/cat.py index 2226819d7..31f2af141 100644 --- a/medcat-v2/medcat/cat.py +++ b/medcat-v2/medcat/cat.py @@ -180,7 +180,7 @@ def _generate_batches_by_char_length( yield docs docs = [] char_count = clen - docs.append((doc_index, doc, only_cui)) + docs.append((doc, doc_index, only_cui)) if len(docs) > 0: yield docs From 32144d10aae737fbbc928e70eb7dbb1dfb6b9714 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 7 Jul 2025 16:53:36 +0100 Subject: [PATCH 2/4] CU-8699py5m0: Set addon data paths for other threads upon multiprocessing --- medcat-v2/medcat/cat.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/medcat-v2/medcat/cat.py b/medcat-v2/medcat/cat.py index 31f2af141..c392fa1e1 100644 --- a/medcat-v2/medcat/cat.py +++ b/medcat-v2/medcat/cat.py @@ -31,6 +31,7 @@ from medcat.utils.defaults import doing_legacy_conversion_message from medcat.utils.defaults import LegacyConversionDisabledError from medcat.utils.usage_monitoring import UsageMonitor +from medcat.utils.import_utils import MissingDependenciesError logger = logging.getLogger(__name__) @@ -157,6 +158,25 @@ def _mp_worker_func( self, texts_and_indices: list[tuple[str, str, bool]] ) -> list[tuple[str, str, Union[dict, Entities, OnlyCUIEntities]]]: + # NOTE: this is needed for subprocesses as otherwise they wouldn't have + # any of these set + # NOTE: these need to be dynamic in case the extras aren't included + try: + from medcat.components.addons.meta_cat import MetaCATAddon + has_meta_cat = True + except MissingDependenciesError: + has_meta_cat = False + try: + from medcat.components.addons.relation_extraction.rel_cat import ( 
RelCATAddon) + has_rel_cat = True + except MissingDependenciesError: + has_rel_cat = False + for addon in self._pipeline.iter_addons(): + if has_meta_cat and isinstance(addon, MetaCATAddon): + addon._init_data_paths(self._pipeline.tokenizer) + elif has_rel_cat and isinstance(addon, RelCATAddon): + addon._rel_cat._init_data_paths() return [ (text, text_index, self.get_entities(text, only_cui=only_cui)) for text, text_index, only_cui in texts_and_indices] From 2c74af1af5d5fdfdc8be42ef3be25c7e4cdb12a8 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 7 Jul 2025 18:45:10 +0100 Subject: [PATCH 3/4] CU-8699py5m0: Fix ordering of text and index when doing sequentially --- medcat-v2/medcat/cat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/medcat-v2/medcat/cat.py b/medcat-v2/medcat/cat.py index c392fa1e1..cd45d4896 100644 --- a/medcat-v2/medcat/cat.py +++ b/medcat-v2/medcat/cat.py @@ -346,7 +346,7 @@ def get_entities_multi_texts( if n_process == 1: # just do in series for batch in batch_iter: - for text_index, _, result in self._mp_worker_func(batch): + for _, text_index, result in self._mp_worker_func(batch): yield text_index, result return From fd75294eb5626a19a0dd34cc39785140abdde658 Mon Sep 17 00:00:00 2001 From: mart-r Date: Mon, 7 Jul 2025 18:46:51 +0100 Subject: [PATCH 4/4] CU-8699py5m0: Update tests with correct order of text/index for multiprocessing --- medcat-v2/tests/test_cat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat-v2/tests/test_cat.py b/medcat-v2/tests/test_cat.py index 4c8263b54..f945903cc 100644 --- a/medcat-v2/tests/test_cat.py +++ b/medcat-v2/tests/test_cat.py @@ -728,7 +728,7 @@ def test_batching_gets_full_char(self): # has all texts self.assertEqual(sum(len(batch) for batch in batches), self.NUM_TEXTS) # has all characters - self.assertEqual(sum(len(text[1]) for text in batches[0]), + self.assertEqual(sum(len(text[0]) for text in batches[0]), self.total_text_length) def 
test_batching_gets_all_half_at_a_time(self): @@ -746,7 +746,7 @@ def test_batching_gets_all_half_at_a_time(self): # has all texts self.assertEqual(sum(len(batch) for batch in batches), self.NUM_TEXTS) # has all characters - self.assertEqual(sum(len(text[1]) + self.assertEqual(sum(len(text[0]) for batch in batches for text in batch), self.total_text_length)