From 56456f064137d2d5cf015256f77cec28a85d85aa Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 14:06:56 +0100
Subject: [PATCH 01/13] CU-869ag0tqj: Add tests to show issue with dashes in CDB maker with regex-based tokenizer

---
 .../tests/model_creation/test_cdb_maker.py | 39 ++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/medcat-v2/tests/model_creation/test_cdb_maker.py b/medcat-v2/tests/model_creation/test_cdb_maker.py
index daec0a4e1..eba23447c 100644
--- a/medcat-v2/tests/model_creation/test_cdb_maker.py
+++ b/medcat-v2/tests/model_creation/test_cdb_maker.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import numpy as np
+import pandas as pd
 from medcat.model_creation.cdb_maker import CDBMaker
 from medcat.cdb import CDB
 from medcat.config import Config
@@ -15,12 +16,15 @@
 
 
 class CDBMakerBaseTests(unittest.TestCase):
+    use_spacy = False
 
     @classmethod
     def setUpClass(cls):
         cls.config = Config()
         cls.config.general.log_level = logging.DEBUG
-        cls.config.general.nlp.modelname = "en_core_web_md"
+        if cls.use_spacy:
+            cls.config.general.nlp.provider = 'spacy'
+            cls.config.general.nlp.modelname = "en_core_web_md"
         cls.maker = CDBMaker(cls.config)
         csvs = [
             os.path.join(MODEL_CREATION_RES_PATH, 'cdb.csv'),
@@ -29,6 +33,39 @@ def setUpClass(cls):
         cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)
 
 
+class MakeWithDashes(CDBMakerBaseTests):
+    cui = '69482004'
+    namelist = ["Korsakoff's psychosis",
+                'Wernicke-Korsakoff syndrome',
+                'Korsakov syndrome - alcoholic']
+    cui_df = pd.DataFrame({'cui': cui, 'name': namelist})
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        cls.maker.prepare_csvs([cls.cui_df, ])
+
+    def test_has_cui(self):
+        self.assertIn(self.cui, self.cdb.cui2info)
+
+    def test_has_full_names(self):
+        sep = self.config.general.separator
+        for _name in self.namelist:
+            # lowercase and pre-condition
+            name = _name.lower()
+            name = name.replace(" ", sep)
+            name = name.replace("-", sep)
+            name = name.replace("'", sep)
+            name = name.replace(sep * 2, sep)
+            name = name.replace(sep * 2, sep)
+            with self.subTest(f"Name: {_name} ({name})"):
+                self.assertIn(name, self.cdb.name2info.keys())
+
+
+class MakeWithDashesSpacy(MakeWithDashes):
+    use_spacy = True
+
+
 class CDBMakerLoadTests(CDBMakerBaseTests):
     EXPECTED_NAMES = {
         'C0000039': {'virus~k', 'virus', 'virus~m', 'virus~z'},

From 685eb8098a54ae0a84db08f3e88dd312b0f39dc1 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:26:44 +0100
Subject: [PATCH 02/13] CU-869ag0tqj: Make a full build at test time when adding dash-based names

---
 medcat-v2/tests/model_creation/test_cdb_maker.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/medcat-v2/tests/model_creation/test_cdb_maker.py b/medcat-v2/tests/model_creation/test_cdb_maker.py
index eba23447c..79618c76d 100644
--- a/medcat-v2/tests/model_creation/test_cdb_maker.py
+++ b/medcat-v2/tests/model_creation/test_cdb_maker.py
@@ -43,7 +43,7 @@ class MakeWithDashes(CDBMakerBaseTests):
     @classmethod
     def setUpClass(cls):
         super().setUpClass()
-        cls.maker.prepare_csvs([cls.cui_df, ])
+        cls.maker.prepare_csvs([cls.cui_df, ], full_build=True)
 
     def test_has_cui(self):
         self.assertIn(self.cui, self.cdb.cui2info)

From 9276ef7470c1069cd7cc28526df1d3567d5c0f32 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:27:46 +0100
Subject: [PATCH 03/13] CU-869ag0tqj: Separate starting punctuation in regex based tokenizer

---
 .../medcat/tokenizing/regex_impl/tokenizer.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 874f51dcb..808276d67 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -324,6 +324,7 @@ def _entity_from_tokens(doc: Document, tokens: list[MutableToken],
 
 
 class RegexTokenizer(BaseTokenizer):
+    PUNCT = "'\"-_.,:;()[]{}<>*&^%$@!?|\\/+="
     REGEX = r'((\b\w+\b|\S+)\s?)'
     # group 1: text with whitespace (if present)
     # group 2: text with no whitespace
@@ -348,7 +349,18 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
     def __call__(self, text: str) -> MutableDocument:
-        tokens = re.finditer(self.REGEX, text)
+        _tokens = re.finditer(self.REGEX, text)
+        tokens: list[re.Match[str]] = []
+        for tkn in _tokens:
+            t_text = tkn.group()
+            if t_text and t_text[0] in self.PUNCT:
+                before = re.match(r"((.))", t_text[0])
+                tokens.append(before)
+                if len(t_text.strip()) > 1:
+                    after = re.match(self.REGEX, t_text[1:])
+                    tokens.append(after)
+            else:
+                tokens.append(tkn)
         doc = Document(text)
         for tkn_index, match in enumerate(tokens):
             start_index = match.start()

From 0f4b7f369a5490c45db1a51d5d6441bfbfa0e339 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:34:10 +0100
Subject: [PATCH 04/13] CU-869ag0tqj: Use compiled regex for regex-based tokenizer

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 808276d67..5c7e1ac80 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -324,8 +324,8 @@ def _entity_from_tokens(doc: Document, tokens: list[MutableToken],
 
 
 class RegexTokenizer(BaseTokenizer):
-    PUNCT = "'\"-_.,:;()[]{}<>*&^%$@!?|\\/+="
-    REGEX = r'((\b\w+\b|\S+)\s?)'
+    PUNCT_REGEX = re.compile(r'[^a-zA-Z0-9]+')
+    REGEX = re.compile(r'((\b\w+\b|\S+)\s?)')
     # group 1: text with whitespace (if present)
     # group 2: text with no whitespace
 
@@ -349,11 +349,11 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
     def __call__(self, text: str) -> MutableDocument:
-        _tokens = re.finditer(self.REGEX, text)
+        _tokens = self.REGEX.finditer(text)
         tokens: list[re.Match[str]] = []
         for tkn in _tokens:
             t_text = tkn.group()
-            if t_text and t_text[0] in self.PUNCT:
+            if t_text and self.PUNCT_REGEX.match(t_text[0]):
                 before = re.match(r"((.))", t_text[0])
                 tokens.append(before)
                 if len(t_text.strip()) > 1:

From 7ebcb8701fc96a122d2738b81021a9ed3c04ae4b Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:36:36 +0100
Subject: [PATCH 05/13] CU-869ag0tqj: Fix small typing issues

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 5c7e1ac80..1d4df8eec 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -355,9 +355,17 @@ def __call__(self, text: str) -> MutableDocument:
             t_text = tkn.group()
             if t_text and self.PUNCT_REGEX.match(t_text[0]):
                 before = re.match(r"((.))", t_text[0])
+                if before is None:
+                    raise ValueError(
+                        "Got an unmatched character somehow (before): "
+                        f"'{t_text[0]}'")
                 tokens.append(before)
                 if len(t_text.strip()) > 1:
                     after = re.match(self.REGEX, t_text[1:])
+                    if after is None:
+                        raise ValueError(
+                            "Got an unmatched character somehow (after): "
+                            f"'{t_text[1:]}'")
                     tokens.append(after)
             else:
                 tokens.append(tkn)

From cde7e3495de5492bd4a38f9b08dcc5ea4372e5c0 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:37:11 +0100
Subject: [PATCH 06/13] CU-869ag0tqj: Add a small comment

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 1d4df8eec..d7add3e85 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -356,6 +356,7 @@ def __call__(self, text: str) -> MutableDocument:
             if t_text and self.PUNCT_REGEX.match(t_text[0]):
                 before = re.match(r"((.))", t_text[0])
                 if before is None:
+                    # NOTE: explicitly cannot happen since anything goes
                     raise ValueError(
                         "Got an unmatched character somehow (before): "
                         f"'{t_text[0]}'")
@@ -363,6 +364,7 @@ def __call__(self, text: str) -> MutableDocument:
                 if len(t_text.strip()) > 1:
                     after = re.match(self.REGEX, t_text[1:])
                     if after is None:
+                        # NOTE: explicitly cannot happen since there's a check
                         raise ValueError(
                             "Got an unmatched character somehow (after): "
                             f"'{t_text[1:]}'")

From faf2e2c9f082c72c57f07b047ddc00013f97502a Mon Sep 17 00:00:00 2001
From: mart-r
Date: Mon, 15 Sep 2025 16:56:40 +0100
Subject: [PATCH 07/13] CU-869ag0tqj: Update regex-based tokenizer tests

---
 medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py b/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
index e30f70821..718b9202d 100644
--- a/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
+++ b/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
@@ -8,7 +8,7 @@ class TokenizerTests(TestCase):
     TEXT_SIMPLE = ("This is - some simple test and 32 numbers 2-tokenize! "
                    "And then some!")
     EXP_TOKENS = ["This", "is", "-", "some", "simple", "test", "and", "32",
-                  "numbers", "2", "-tokenize!", "And", "then", "some", "!"]
+                  "numbers", "2", "-", "tokenize", "And", "then", "some", "!"]
     BIG_NUMBER = 10_000_000
 
     @classmethod

From b99bedc8d6b5f6211826377ddb09de29889458e5 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Thu, 18 Sep 2025 09:43:09 +0100
Subject: [PATCH 08/13] CU-869ag0tqj: Refactor token getting in regex tokenizer somewhat

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index d7add3e85..7dd4f3f29 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -348,7 +348,7 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         end_index = doc._tokens.index(tokens[-1])
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
-    def __call__(self, text: str) -> MutableDocument:
+    def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
         _tokens = self.REGEX.finditer(text)
         tokens: list[re.Match[str]] = []
         for tkn in _tokens:
@@ -371,6 +371,10 @@ def __call__(self, text: str) -> MutableDocument:
                     tokens.append(after)
             else:
                 tokens.append(tkn)
+        return tokens
+
+    def __call__(self, text: str) -> MutableDocument:
+        tokens = self._get_tokens_matches(text)
         doc = Document(text)
         for tkn_index, match in enumerate(tokens):
             start_index = match.start()

From e826e219d5f5768bdd04f8f1f7bb0a51e2b4c053 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Thu, 18 Sep 2025 09:46:09 +0100
Subject: [PATCH 09/13] CU-869ag0tqj: Separate punctuation getting for regex tokenizing

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 7dd4f3f29..b27653289 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -348,8 +348,9 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         end_index = doc._tokens.index(tokens[-1])
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
-    def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
-        _tokens = self.REGEX.finditer(text)
+    def _split_punctuation_into_separate_matches(
+            self, _tokens: Iterator[re.Match[str]]
+    ) -> list[re.Match[str]]:
         tokens: list[re.Match[str]] = []
         for tkn in _tokens:
             t_text = tkn.group()
@@ -373,6 +374,10 @@ def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
                 tokens.append(tkn)
         return tokens
 
+    def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
+        tokens = self.REGEX.finditer(text)
+        return self._split_punctuation_into_separate_matches(tokens)
+
     def __call__(self, text: str) -> MutableDocument:
         tokens = self._get_tokens_matches(text)
         doc = Document(text)

From e037bbded27959cfb2d431d01566dde36d72ed6d Mon Sep 17 00:00:00 2001
From: mart-r
Date: Thu, 18 Sep 2025 13:38:42 +0100
Subject: [PATCH 10/13] CU-869ag0tqj: Add some further comments in code

---
 medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index b27653289..0990eaf4c 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -354,7 +354,9 @@ def _split_punctuation_into_separate_matches(
         tokens: list[re.Match[str]] = []
         for tkn in _tokens:
             t_text = tkn.group()
+            # checking if first character is punctuation
             if t_text and self.PUNCT_REGEX.match(t_text[0]):
+                # if it is, then separate it to a separate Match object
                 before = re.match(r"((.))", t_text[0])
                 if before is None:
                     # NOTE: explicitly cannot happen since anything goes
@@ -363,6 +365,9 @@ def _split_punctuation_into_separate_matches(
                         f"'{t_text[0]}'")
                 tokens.append(before)
                 if len(t_text.strip()) > 1:
+                    # if there's something other than the first element
+                    # i.e more than just the punctuation
+                    # use the rest as a separate match
                     after = re.match(self.REGEX, t_text[1:])
                     if after is None:
                         # NOTE: explicitly cannot happen since there's a check

From 0ec0c7a4b40a46945fc5159eccc253bdd9b64293 Mon Sep 17 00:00:00 2001
From: mart-r
Date: Fri, 19 Sep 2025 09:06:27 +0100
Subject: [PATCH 11/13] CU-869ag0tqj: Fix regex tokenizer expected results

---
 medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py b/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
index 718b9202d..0d5507826 100644
--- a/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
+++ b/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
@@ -8,7 +8,8 @@ class TokenizerTests(TestCase):
     TEXT_SIMPLE = ("This is - some simple test and 32 numbers 2-tokenize! "
                    "And then some!")
     EXP_TOKENS = ["This", "is", "-", "some", "simple", "test", "and", "32",
-                  "numbers", "2", "-", "tokenize", "And", "then", "some", "!"]
+                  "numbers", "2", "-", "tokenize", "!", "And", "then", "some",
+                  "!"]
     BIG_NUMBER = 10_000_000
 
     @classmethod

From c6963cf53475b144b0e17d73a3149097a1b4661b Mon Sep 17 00:00:00 2001
From: mart-r
Date: Fri, 19 Sep 2025 09:07:03 +0100
Subject: [PATCH 12/13] CU-869ag0tqj: Simplify regex for punctuation separation in regex tokenizer

---
 .../medcat/tokenizing/regex_impl/tokenizer.py | 36 ++-----------------
 1 file changed, 2 insertions(+), 34 deletions(-)

diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 0990eaf4c..01b60b016 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -324,8 +324,7 @@ def _entity_from_tokens(doc: Document, tokens: list[MutableToken],
 
 
 class RegexTokenizer(BaseTokenizer):
-    PUNCT_REGEX = re.compile(r'[^a-zA-Z0-9]+')
-    REGEX = re.compile(r'((\b\w+\b|\S+)\s?)')
+    REGEX = re.compile(r'(([^a-zA-Z0-9\s]+|\b\w+\b|\S+)\s?)')
     # group 1: text with whitespace (if present)
     # group 2: text with no whitespace
 
@@ -348,40 +347,9 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         end_index = doc._tokens.index(tokens[-1])
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
-    def _split_punctuation_into_separate_matches(
-            self, _tokens: Iterator[re.Match[str]]
-    ) -> list[re.Match[str]]:
-        tokens: list[re.Match[str]] = []
-        for tkn in _tokens:
-            t_text = tkn.group()
-            # checking if first character is punctuation
-            if t_text and self.PUNCT_REGEX.match(t_text[0]):
-                # if it is, then separate it to a separate Match object
-                before = re.match(r"((.))", t_text[0])
-                if before is None:
-                    # NOTE: explicitly cannot happen since anything goes
-                    raise ValueError(
-                        "Got an unmatched character somehow (before): "
-                        f"'{t_text[0]}'")
-                tokens.append(before)
-                if len(t_text.strip()) > 1:
-                    # if there's something other than the first element
-                    # i.e more than just the punctuation
-                    # use the rest as a separate match
-                    after = re.match(self.REGEX, t_text[1:])
-                    if after is None:
-                        # NOTE: explicitly cannot happen since there's a check
-                        raise ValueError(
-                            "Got an unmatched character somehow (after): "
-                            f"'{t_text[1:]}'")
-                    tokens.append(after)
-            else:
-                tokens.append(tkn)
-        return tokens
-
     def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
         tokens = self.REGEX.finditer(text)
-        return self._split_punctuation_into_separate_matches(tokens)
+        return list(tokens)
 
     def __call__(self, text: str) -> MutableDocument:
         tokens = self._get_tokens_matches(text)
         doc = Document(text)

From 6a7e21a889519dd7de9e93c5a142d29c180c991c Mon Sep 17 00:00:00 2001
From: mart-r
Date: Fri, 19 Sep 2025 10:20:43 +0100
Subject: [PATCH 13/13] CU-869ag0tqj: Add explicit list of expected names in CDB maker test with dashes

---
 .../tests/model_creation/test_cdb_maker.py | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/medcat-v2/tests/model_creation/test_cdb_maker.py b/medcat-v2/tests/model_creation/test_cdb_maker.py
index 79618c76d..4fc5e4415 100644
--- a/medcat-v2/tests/model_creation/test_cdb_maker.py
+++ b/medcat-v2/tests/model_creation/test_cdb_maker.py
@@ -38,6 +38,15 @@ class MakeWithDashes(CDBMakerBaseTests):
     namelist = ["Korsakoff's psychosis",
                 'Wernicke-Korsakoff syndrome',
                 'Korsakov syndrome - alcoholic']
+    expected_names = [
+        # NOTE: whitespace and punctuation (e.g spaces, dashes)
+        # are replaced with separator (~) here
+        # and names are lower case
+        # notably, only 1 separator at a time is shown
+        "korsakoff~s~psychosis",
+        "wernicke~korsakoff~syndrome",
+        "korsakov~syndrome~alcoholic",
+    ]
     cui_df = pd.DataFrame({'cui': cui, 'name': namelist})
 
     @classmethod
@@ -49,16 +58,8 @@ def test_has_cui(self):
         self.assertIn(self.cui, self.cdb.cui2info)
 
     def test_has_full_names(self):
-        sep = self.config.general.separator
-        for _name in self.namelist:
-            # lowercase and pre-condition
-            name = _name.lower()
-            name = name.replace(" ", sep)
-            name = name.replace("-", sep)
-            name = name.replace("'", sep)
-            name = name.replace(sep * 2, sep)
-            name = name.replace(sep * 2, sep)
-            with self.subTest(f"Name: {_name} ({name})"):
+        for name in self.expected_names:
+            with self.subTest(f"Name: {name}"):
                 self.assertIn(name, self.cdb.name2info.keys())
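
Note for reviewers (illustrative only, not part of any patch above): the snippet below exercises the simplified pattern from PATCH 12 on a dash-heavy sample string to show why names such as 'Wernicke-Korsakoff syndrome' now tokenize with the dash as its own token. Only the REGEX literal is taken verbatim from the patch; the sample text and the printed result are assumptions for demonstration.

    import re

    # Pattern introduced in PATCH 12: a punctuation run, a whole word, or any
    # other non-whitespace run, each optionally followed by one whitespace char.
    REGEX = re.compile(r'(([^a-zA-Z0-9\s]+|\b\w+\b|\S+)\s?)')

    # Hypothetical sample text, not taken from the repository.
    text = "Wernicke-Korsakoff syndrome - alcoholic 2-tokenize!"
    print([m.group(2) for m in REGEX.finditer(text)])
    # ['Wernicke', '-', 'Korsakoff', 'syndrome', '-', 'alcoholic',
    #  '2', '-', 'tokenize', '!']

With every dash emitted as a separate token, lower-casing and joining on the '~' separator yields names like 'wernicke~korsakoff~syndrome', which is exactly what the expected_names list added in PATCH 13 checks for.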