From 56456f064137d2d5cf015256f77cec28a85d85aa Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 14:06:56 +0100
Subject: [PATCH 01/13] CU-869ag0tqj: Add tests to show issue with dashes in CDB maker with regex-based tokenizer

---
 .../tests/model_creation/test_cdb_maker.py | 39 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/medcat-v2/tests/model_creation/test_cdb_maker.py b/medcat-v2/tests/model_creation/test_cdb_maker.py
index daec0a4e1..eba23447c 100644
--- a/medcat-v2/tests/model_creation/test_cdb_maker.py
+++ b/medcat-v2/tests/model_creation/test_cdb_maker.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import numpy as np
+import pandas as pd
 from medcat.model_creation.cdb_maker import CDBMaker
 from medcat.cdb import CDB
 from medcat.config import Config
@@ -15,12 +16,15 @@
 
 
 class CDBMakerBaseTests(unittest.TestCase):
+    use_spacy = False
 
     @classmethod
     def setUpClass(cls):
         cls.config = Config()
         cls.config.general.log_level = logging.DEBUG
-        cls.config.general.nlp.modelname = "en_core_web_md"
+        if cls.use_spacy:
+            cls.config.general.nlp.provider = 'spacy'
+            cls.config.general.nlp.modelname = "en_core_web_md"
         cls.maker = CDBMaker(cls.config)
         csvs = [
             os.path.join(MODEL_CREATION_RES_PATH, 'cdb.csv'),
@@ -29,6 +33,39 @@ def setUpClass(cls):
         cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)
 
 
+class MakeWithDashes(CDBMakerBaseTests):
+    cui = '69482004'
+    namelist = ["Korsakoff's psychosis",
+                'Wernicke-Korsakoff syndrome',
+                'Korsakov syndrome - alcoholic']
+    cui_df = pd.DataFrame({'cui': cui, 'name': namelist})
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.maker.prepare_csvs([cls.cui_df, ])
+
+    def test_has_cui(self):
+        self.assertIn(self.cui, self.cdb.cui2info)
+
+    def test_has_full_names(self):
+        sep = self.config.general.separator
+        for _name in self.namelist:
+            # lowercase and pre-condition
+            name = _name.lower()
+            name = name.replace(" ", sep)
+            name = name.replace("-", sep)
+            name = name.replace("'", sep)
+            name = name.replace(sep * 2, sep)
+            name = name.replace(sep * 2, sep)
+            with self.subTest(f"Name: {_name} ({name})"):
+                self.assertIn(name, self.cdb.name2info.keys())
+
+
+class MakeWithDashesSpacy(MakeWithDashes):
+    use_spacy = True
+
+
 class CDBMakerLoadTests(CDBMakerBaseTests):
     EXPECTED_NAMES = {
         'C0000039': {'virus~k', 'virus', 'virus~m', 'virus~z'},

From 685eb8098a54ae0a84db08f3e88dd312b0f39dc1 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:26:44 +0100
Subject: [PATCH 02/13] CU-869ag0tqj: Make a full build at test time when adding dash-based names

---
 medcat-v2/tests/model_creation/test_cdb_maker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/medcat-v2/tests/model_creation/test_cdb_maker.py b/medcat-v2/tests/model_creation/test_cdb_maker.py
index eba23447c..79618c76d 100644
--- a/medcat-v2/tests/model_creation/test_cdb_maker.py
+++ b/medcat-v2/tests/model_creation/test_cdb_maker.py
@@ -43,7 +43,7 @@ class MakeWithDashes(CDBMakerBaseTests):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
-        cls.maker.prepare_csvs([cls.cui_df, ])
+        cls.maker.prepare_csvs([cls.cui_df, ], full_build=True)
 
     def test_has_cui(self):
         self.assertIn(self.cui, self.cdb.cui2info)

From 9276ef7470c1069cd7cc28526df1d3567d5c0f32 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:27:46 +0100
Subject: [PATCH 03/13] CU-869ag0tqj: Separate starting punctuation in regex based tokenizer

---
 .../medcat/tokenizing/regex_impl/tokenizer.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 874f51dcb..808276d67 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -324,6 +324,7 @@ def _entity_from_tokens(doc: Document, tokens: list[MutableToken],
 
 
 class RegexTokenizer(BaseTokenizer):
+    PUNCT = "'\"-_.,:;()[]{}<>*&^%$@!?|\\/+="
     REGEX = r'((\b\w+\b|\S+)\s?)'
     # group 1: text with whitespace (if present)
     # group 2: text with no whitespace
@@ -348,7 +349,18 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
     def __call__(self, text: str) -> MutableDocument:
-        tokens = re.finditer(self.REGEX, text)
+        _tokens = re.finditer(self.REGEX, text)
+        tokens: list[re.Match[str]] = []
+        for tkn in _tokens:
+            t_text = tkn.group()
+            if t_text and t_text[0] in self.PUNCT:
+                before = re.match(r"((.))", t_text[0])
+                tokens.append(before)
+                if len(t_text.strip()) > 1:
+                    after = re.match(self.REGEX, t_text[1:])
+                    tokens.append(after)
+            else:
+                tokens.append(tkn)
         doc = Document(text)
         for tkn_index, match in enumerate(tokens):
             start_index = match.start()

From 0f4b7f369a5490c45db1a51d5d6441bfbfa0e339 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:34:10 +0100
Subject: [PATCH 04/13] CU-869ag0tqj: Use compiled regex for regex-based tokenizer

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 808276d67..5c7e1ac80 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -324,8 +324,8 @@ def _entity_from_tokens(doc: Document, tokens: list[MutableToken],
 
 
 class RegexTokenizer(BaseTokenizer):
-    PUNCT = "'\"-_.,:;()[]{}<>*&^%$@!?|\\/+="
-    REGEX = r'((\b\w+\b|\S+)\s?)'
+    PUNCT_REGEX = re.compile(r'[^a-zA-Z0-9]+')
+    REGEX = re.compile(r'((\b\w+\b|\S+)\s?)')
     # group 1: text with whitespace (if present)
     # group 2: text with no whitespace
 
@@ -349,11 +349,11 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
     def __call__(self, text: str) -> MutableDocument:
-        _tokens = re.finditer(self.REGEX, text)
+        _tokens = self.REGEX.finditer(text)
         tokens: list[re.Match[str]] = []
         for tkn in _tokens:
             t_text = tkn.group()
-            if t_text and t_text[0] in self.PUNCT:
+            if t_text and self.PUNCT_REGEX.match(t_text[0]):
                 before = re.match(r"((.))", t_text[0])
                 tokens.append(before)
                 if len(t_text.strip()) > 1:

From 7ebcb8701fc96a122d2738b81021a9ed3c04ae4b Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:36:36 +0100
Subject: [PATCH 05/13] CU-869ag0tqj: Fix small typing issues

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 5c7e1ac80..1d4df8eec 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -355,9 +355,17 @@ def __call__(self, text: str) -> MutableDocument:
             t_text = tkn.group()
             if t_text and self.PUNCT_REGEX.match(t_text[0]):
                 before = re.match(r"((.))", t_text[0])
+                if before is None:
+                    raise ValueError(
+                        "Got an unmatched character somehow (before): "
+                        f"'{t_text[0]}'")
                 tokens.append(before)
                 if len(t_text.strip()) > 1:
                     after = re.match(self.REGEX, t_text[1:])
+                    if after is None:
+                        raise ValueError(
+                            "Got an unmatched character somehow (after): "
+                            f"'{t_text[1:]}'")
                     tokens.append(after)
             else:
                 tokens.append(tkn)

From cde7e3495de5492bd4a38f9b08dcc5ea4372e5c0 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:37:11 +0100
Subject: [PATCH 06/13] CU-869ag0tqj: Add a small comment

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 1d4df8eec..d7add3e85 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -356,6 +356,7 @@ def __call__(self, text: str) -> MutableDocument:
             if t_text and self.PUNCT_REGEX.match(t_text[0]):
                 before = re.match(r"((.))", t_text[0])
                 if before is None:
+                    # NOTE: explicitly cannot happen since anything goes
                     raise ValueError(
                         "Got an unmatched character somehow (before): "
                         f"'{t_text[0]}'")
@@ -363,6 +364,7 @@ def __call__(self, text: str) -> MutableDocument:
                 if len(t_text.strip()) > 1:
                     after = re.match(self.REGEX, t_text[1:])
                     if after is None:
+                        # NOTE: explicitly cannot happen since there's a check
                         raise ValueError(
                             "Got an unmatched character somehow (after): "
                             f"'{t_text[1:]}'")

From faf2e2c9f082c72c57f07b047ddc00013f97502a Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:56:40 +0100
Subject: [PATCH 07/13] CU-869ag0tqj: Update regex-based tokenizer tests

---
 medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py b/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
index e30f70821..718b9202d 100644
--- a/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
+++ b/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
@@ -8,7 +8,7 @@ class TokenizerTests(TestCase):
     TEXT_SIMPLE = ("This is - some simple test and 32 numbers 2-tokenize! "
                    "And then some!")
     EXP_TOKENS = ["This", "is", "-", "some", "simple", "test", "and", "32",
-                  "numbers", "2", "-tokenize!", "And", "then", "some", "!"]
+                  "numbers", "2", "-", "tokenize", "And", "then", "some", "!"]
     BIG_NUMBER = 10_000_000
 
     @classmethod

From b99bedc8d6b5f6211826377ddb09de29889458e5 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Thu, 18 Sep 2025 09:43:09 +0100
Subject: [PATCH 08/13] CU-869ag0tqj: Refactor token getting in regex tokenizer somewhat

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index d7add3e85..7dd4f3f29 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -348,7 +348,7 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         end_index = doc._tokens.index(tokens[-1])
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
-    def __call__(self, text: str) -> MutableDocument:
+    def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
         _tokens = self.REGEX.finditer(text)
         tokens: list[re.Match[str]] = []
         for tkn in _tokens:
@@ -371,6 +371,10 @@ def __call__(self, text: str) -> MutableDocument:
                     tokens.append(after)
             else:
                 tokens.append(tkn)
+        return tokens
+
+    def __call__(self, text: str) -> MutableDocument:
+        tokens = self._get_tokens_matches(text)
         doc = Document(text)
         for tkn_index, match in enumerate(tokens):
             start_index = match.start()

From e826e219d5f5768bdd04f8f1f7bb0a51e2b4c053 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Thu, 18 Sep 2025 09:46:09 +0100
Subject: [PATCH 09/13] CU-869ag0tqj: Separate punctuation getting for regex tokenizing

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 7dd4f3f29..b27653289 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -348,8 +348,9 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         end_index = doc._tokens.index(tokens[-1])
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
-    def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
-        _tokens = self.REGEX.finditer(text)
+    def _split_punctuation_into_separate_matches(
+            self, _tokens: Iterator[re.Match[str]]
+    ) -> list[re.Match[str]]:
         tokens: list[re.Match[str]] = []
         for tkn in _tokens:
             t_text = tkn.group()
@@ -373,6 +374,10 @@ def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
                 tokens.append(tkn)
         return tokens
 
+    def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
+        tokens = self.REGEX.finditer(text)
+        return self._split_punctuation_into_separate_matches(tokens)
+
     def __call__(self, text: str) -> MutableDocument:
         tokens = self._get_tokens_matches(text)
         doc = Document(text)

From e037bbded27959cfb2d431d01566dde36d72ed6d Mon Sep 17 00:00:00 2001
From: mart-r
Date: Thu, 18 Sep 2025 13:38:42 +0100
Subject: [PATCH 10/13] CU-869ag0tqj: Add some further comments in code

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index b27653289..0990eaf4c 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -354,7 +354,9 @@ def _split_punctuation_into_separate_matches(
         tokens: list[re.Match[str]] = []
         for tkn in _tokens:
             t_text = tkn.group()
+            # checking if first character is punctuation
             if t_text and self.PUNCT_REGEX.match(t_text[0]):
+                # if it is, then separate it to a separate Match object
                 before = re.match(r"((.))", t_text[0])
                 if before is None:
                     # NOTE: explicitly cannot happen since anything goes
@@ -363,6 +365,9 @@ def _split_punctuation_into_separate_matches(
                         f"'{t_text[0]}'")
                 tokens.append(before)
                 if len(t_text.strip()) > 1:
+                    # if there's something other than the first element
+                    # i.e more than just the punctuation
+                    # use the rest as a separate match
                     after = re.match(self.REGEX, t_text[1:])
                     if after is None:
                         # NOTE: explicitly cannot happen since there's a check

From 0ec0c7a4b40a46945fc5159eccc253bdd9b64293 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Fri, 19 Sep 2025 09:06:27 +0100
Subject: [PATCH 11/13] CU-869ag0tqj: Fix regex tokenizer expected results

---
 medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py b/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
index 718b9202d..0d5507826 100644
--- a/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
+++ b/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
@@ -8,7 +8,8 @@ class TokenizerTests(TestCase):
     TEXT_SIMPLE = ("This is - some simple test and 32 numbers 2-tokenize! "
                    "And then some!")
     EXP_TOKENS = ["This", "is", "-", "some", "simple", "test", "and", "32",
-                  "numbers", "2", "-", "tokenize", "And", "then", "some", "!"]
+                  "numbers", "2", "-", "tokenize", "!", "And", "then", "some",
+                  "!"]
     BIG_NUMBER = 10_000_000
 
     @classmethod

From c6963cf53475b144b0e17d73a3149097a1b4661b Mon Sep 17 00:00:00 2001
From: mart-r
Date: Fri, 19 Sep 2025 09:07:03 +0100
Subject: [PATCH 12/13] CU-869ag0tqj: Simplify regex for punctuation separation in regex tokenizer

---
 .../medcat/tokenizing/regex_impl/tokenizer.py | 36 ++-----------------
 1 file changed, 2 insertions(+), 34 deletions(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 0990eaf4c..01b60b016 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -324,8 +324,7 @@ def _entity_from_tokens(doc: Document, tokens: list[MutableToken],
 
 
 class RegexTokenizer(BaseTokenizer):
-    PUNCT_REGEX = re.compile(r'[^a-zA-Z0-9]+')
-    REGEX = re.compile(r'((\b\w+\b|\S+)\s?)')
+    REGEX = re.compile(r'(([^a-zA-Z0-9\s]+|\b\w+\b|\S+)\s?)')
     # group 1: text with whitespace (if present)
     # group 2: text with no whitespace
 
@@ -348,40 +347,9 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         end_index = doc._tokens.index(tokens[-1])
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
-    def _split_punctuation_into_separate_matches(
-            self, _tokens: Iterator[re.Match[str]]
-    ) -> list[re.Match[str]]:
-        tokens: list[re.Match[str]] = []
-        for tkn in _tokens:
-            t_text = tkn.group()
-            # checking if first character is punctuation
-            if t_text and self.PUNCT_REGEX.match(t_text[0]):
-                # if it is, then separate it to a separate Match object
-                before = re.match(r"((.))", t_text[0])
-                if before is None:
-                    # NOTE: explicitly cannot happen since anything goes
-                    raise ValueError(
-                        "Got an unmatched character somehow (before): "
-                        f"'{t_text[0]}'")
-                tokens.append(before)
-                if len(t_text.strip()) > 1:
-                    # if there's something other than the first element
-                    # i.e more than just the punctuation
-                    # use the rest as a separate match
-                    after = re.match(self.REGEX, t_text[1:])
-                    if after is None:
-                        # NOTE: explicitly cannot happen since there's a check
-                        raise ValueError(
-                            "Got an unmatched character somehow (after): "
-                            f"'{t_text[1:]}'")
-                    tokens.append(after)
-            else:
-                tokens.append(tkn)
-        return tokens
-
     def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
         tokens = self.REGEX.finditer(text)
-        return self._split_punctuation_into_separate_matches(tokens)
+        return list(tokens)
 
     def __call__(self, text: str) -> MutableDocument:
         tokens = self._get_tokens_matches(text)
         doc = Document(text)

From 6a7e21a889519dd7de9e93c5a142d29c180c991c Mon Sep 17 00:00:00 2001
From: mart-r
Date: Fri, 19 Sep 2025 10:20:43 +0100
Subject: [PATCH 13/13] CU-869ag0tqj: Add explicit list of expected names in CDB maker test with dashes

---
 .../tests/model_creation/test_cdb_maker.py | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/medcat-v2/tests/model_creation/test_cdb_maker.py b/medcat-v2/tests/model_creation/test_cdb_maker.py
index 79618c76d..4fc5e4415 100644
--- a/medcat-v2/tests/model_creation/test_cdb_maker.py
+++ b/medcat-v2/tests/model_creation/test_cdb_maker.py
@@ -38,6 +38,15 @@ class MakeWithDashes(CDBMakerBaseTests):
     namelist = ["Korsakoff's psychosis",
                 'Wernicke-Korsakoff syndrome',
                 'Korsakov syndrome - alcoholic']
+    expected_names = [
+        # NOTE: whitespace and punctuation (e.g spaces, dashes)
+        # are replaced with separator (~) here
+        # and names are lower case
+        # notably, only 1 separator at a time is shown
+        "korsakoff~s~psychosis",
+        "wernicke~korsakoff~syndrome",
+        "korsakov~syndrome~alcoholic",
+    ]
     cui_df = pd.DataFrame({'cui': cui, 'name': namelist})
 
     @classmethod
@@ -49,16 +58,8 @@ def test_has_cui(self):
         self.assertIn(self.cui, self.cdb.cui2info)
 
     def test_has_full_names(self):
-        sep = self.config.general.separator
-        for _name in self.namelist:
-            # lowercase and pre-condition
-            name = _name.lower()
-            name = name.replace(" ", sep)
-            name = name.replace("-", sep)
-            name = name.replace("'", sep)
-            name = name.replace(sep * 2, sep)
-            name = name.replace(sep * 2, sep)
-            with self.subTest(f"Name: {_name} ({name})"):
+        for name in self.expected_names:
+            with self.subTest(f"Name: {name}"):
                 self.assertIn(name, self.cdb.name2info.keys())
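
Note for reviewers (illustrative only, not part of any patch above): the snippet below exercises the simplified pattern from PATCH 12 on a dash-heavy sample string to show why names such as 'Wernicke-Korsakoff syndrome' now tokenize with the dash as its own token. Only the REGEX literal is taken verbatim from the patch; the sample text and the printed result are assumptions for demonstration.

    import re

    # Pattern introduced in PATCH 12: a punctuation run, a whole word, or any
    # other non-whitespace run, each optionally followed by one whitespace char.
    REGEX = re.compile(r'(([^a-zA-Z0-9\s]+|\b\w+\b|\S+)\s?)')

    # Hypothetical sample text, not taken from the repository.
    text = "Wernicke-Korsakoff syndrome - alcoholic 2-tokenize!"
    print([m.group(2) for m in REGEX.finditer(text)])
    # ['Wernicke', '-', 'Korsakoff', 'syndrome', '-', 'alcoholic',
    #  '2', '-', 'tokenize', '!']

With every dash emitted as a separate token, lower-casing and joining on the '~' separator yields names like 'wernicke~korsakoff~syndrome', which is exactly what the expected_names list added in PATCH 13 checks for.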