Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def _entity_from_tokens(doc: Document, tokens: list[MutableToken],


class RegexTokenizer(BaseTokenizer):
REGEX = r'((\b\w+\b|\S+)\s?)'
REGEX = re.compile(r'(([^a-zA-Z0-9\s]+|\b\w+\b|\S+)\s?)')
# group 1: text with whitespace (if present)
# group 2: text with no whitespace

Expand All @@ -347,8 +347,12 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
end_index = doc._tokens.index(tokens[-1])
return _entity_from_tokens(doc, tokens, start_index, end_index)

def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
tokens = self.REGEX.finditer(text)
return list(tokens)

def __call__(self, text: str) -> MutableDocument:
tokens = re.finditer(self.REGEX, text)
tokens = self._get_tokens_matches(text)
doc = Document(text)
for tkn_index, match in enumerate(tokens):
start_index = match.start()
Expand Down
40 changes: 39 additions & 1 deletion medcat-v2/tests/model_creation/test_cdb_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import numpy as np
import pandas as pd
from medcat.model_creation.cdb_maker import CDBMaker
from medcat.cdb import CDB
from medcat.config import Config
Expand All @@ -15,12 +16,15 @@


class CDBMakerBaseTests(unittest.TestCase):
use_spacy = False

@classmethod
def setUpClass(cls):
cls.config = Config()
cls.config.general.log_level = logging.DEBUG
cls.config.general.nlp.modelname = "en_core_web_md"
if cls.use_spacy:
cls.config.general.nlp.provider = 'spacy'
cls.config.general.nlp.modelname = "en_core_web_md"
cls.maker = CDBMaker(cls.config)
csvs = [
os.path.join(MODEL_CREATION_RES_PATH, 'cdb.csv'),
Expand All @@ -29,6 +33,40 @@ def setUpClass(cls):
cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)


class MakeWithDashes(CDBMakerBaseTests):
    """Verifies that concept names with dashes/punctuation are normalised.

    Whitespace and punctuation (e.g. spaces, dashes, apostrophes) are
    replaced by the separator (``~``) in stored names, names are
    lower-cased, and consecutive separators collapse into one.
    """
    cui = '69482004'
    namelist = [
        "Korsakoff's psychosis",
        'Wernicke-Korsakoff syndrome',
        'Korsakov syndrome - alcoholic',
    ]
    expected_names = [
        "korsakoff~s~psychosis",
        "wernicke~korsakoff~syndrome",
        "korsakov~syndrome~alcoholic",
    ]
    # One-column-per-key frame; the scalar cui broadcasts across all names.
    cui_df = pd.DataFrame({'cui': cui, 'name': namelist})

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # Feed the dashed names into the CDB built by the base class.
        cls.maker.prepare_csvs([cls.cui_df], full_build=True)

    def test_has_cui(self):
        self.assertIn(self.cui, self.cdb.cui2info)

    def test_has_full_names(self):
        for name in self.expected_names:
            with self.subTest(f"Name: {name}"):
                self.assertIn(name, self.cdb.name2info.keys())


class MakeWithDashesSpacy(MakeWithDashes):
    """Repeats the dash-normalisation checks with the spacy NLP provider.

    Setting ``use_spacy`` makes the base ``setUpClass`` configure
    ``config.general.nlp.provider = 'spacy'`` before building the CDB.
    """
    use_spacy = True


class CDBMakerLoadTests(CDBMakerBaseTests):
EXPECTED_NAMES = {
'C0000039': {'virus~k', 'virus', 'virus~m', 'virus~z'},
Expand Down
3 changes: 2 additions & 1 deletion medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ class TokenizerTests(TestCase):
TEXT_SIMPLE = ("This is - some simple test and 32 numbers 2-tokenize! "
"And then some!")
EXP_TOKENS = ["This", "is", "-", "some", "simple", "test", "and", "32",
"numbers", "2", "-tokenize!", "And", "then", "some", "!"]
"numbers", "2", "-", "tokenize", "!", "And", "then", "some",
"!"]
BIG_NUMBER = 10_000_000

@classmethod
Expand Down
Loading