diff --git a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
index 874f51dcb..01b60b016 100644
--- a/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
+++ b/medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
@@ -324,7 +324,12 @@ def _entity_from_tokens(doc: Document, tokens: list[MutableToken],
 
 
 class RegexTokenizer(BaseTokenizer):
-    REGEX = r'((\b\w+\b|\S+)\s?)'
+    # Alternatives are tried left to right: a run of punctuation, then a
+    # whole word, then any other run of non-whitespace characters.
+    # Punctuation is "neither word character nor whitespace, or an
+    # underscore" (rather than an ASCII-only class) so that a word that
+    # starts with a non-ASCII letter is not split after its first letter.
+    REGEX = re.compile(r'(((?:[^\w\s]|_)+|\b\w+\b|\S+)\s?)')
     # group 1: text with whitespace (if present)
     # group 2: text with no whitespace
 
@@ -347,8 +352,20 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
         end_index = doc._tokens.index(tokens[-1])
         return _entity_from_tokens(doc, tokens, start_index, end_index)
 
+    def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
+        """Return every ``REGEX`` match found in ``text``, in order.
+
+        Args:
+            text (str): The raw text to split into tokens.
+
+        Returns:
+            list[re.Match[str]]: The non-overlapping matches, from
+                left to right.
+        """
+        return list(self.REGEX.finditer(text))
+
     def __call__(self, text: str) -> MutableDocument:
-        tokens = re.finditer(self.REGEX, text)
+        tokens = self._get_tokens_matches(text)
         doc = Document(text)
         for tkn_index, match in enumerate(tokens):
             start_index = match.start()
diff --git a/medcat-v2/tests/model_creation/test_cdb_maker.py b/medcat-v2/tests/model_creation/test_cdb_maker.py
index daec0a4e1..4fc5e4415 100644
--- a/medcat-v2/tests/model_creation/test_cdb_maker.py
+++ b/medcat-v2/tests/model_creation/test_cdb_maker.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import numpy as np
+import pandas as pd
 from medcat.model_creation.cdb_maker import CDBMaker
 from medcat.cdb import CDB
 from medcat.config import Config
@@ -15,12 +16,16 @@
 
 
 class CDBMakerBaseTests(unittest.TestCase):
+    # subclasses set this to True to configure the 'spacy' nlp provider
+    use_spacy = False
 
     @classmethod
     def setUpClass(cls):
         cls.config = Config()
         cls.config.general.log_level = logging.DEBUG
-        cls.config.general.nlp.modelname = "en_core_web_md"
+        if cls.use_spacy:
+            cls.config.general.nlp.provider = 'spacy'
+            cls.config.general.nlp.modelname = "en_core_web_md"
         cls.maker = CDBMaker(cls.config)
         csvs = [
             os.path.join(MODEL_CREATION_RES_PATH, 'cdb.csv'),
@@ -29,6 +34,43 @@ def setUpClass(cls):
         cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)
 
 
+class MakeWithDashes(CDBMakerBaseTests):
+    """Names with dashes or apostrophes must end up in the CDB."""
+    cui = '69482004'
+    namelist = ["Korsakoff's psychosis",
+                'Wernicke-Korsakoff syndrome',
+                'Korsakov syndrome - alcoholic']
+    expected_names = [
+        # NOTE: whitespace and punctuation (e.g. spaces, dashes)
+        # are replaced with separator (~) here
+        # and names are lower case
+        # notably, only 1 separator at a time is shown
+        "korsakoff~s~psychosis",
+        "wernicke~korsakoff~syndrome",
+        "korsakov~syndrome~alcoholic",
+    ]
+    cui_df = pd.DataFrame({'cui': cui, 'name': namelist})
+
+    @classmethod
+    def setUpClass(cls):
+        super().setUpClass()
+        # keep the returned CDB, as the base class set-up does
+        cls.cdb = cls.maker.prepare_csvs([cls.cui_df, ], full_build=True)
+
+    def test_has_cui(self):
+        self.assertIn(self.cui, self.cdb.cui2info)
+
+    def test_has_full_names(self):
+        for name in self.expected_names:
+            with self.subTest(f"Name: {name}"):
+                self.assertIn(name, self.cdb.name2info.keys())
+
+
+class MakeWithDashesSpacy(MakeWithDashes):
+    """Same checks, run with the 'spacy' nlp provider configured."""
+    use_spacy = True
+
+
 class CDBMakerLoadTests(CDBMakerBaseTests):
     EXPECTED_NAMES = {
         'C0000039': {'virus~k', 'virus', 'virus~m', 'virus~z'},
diff --git a/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py b/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
index e30f70821..0d5507826 100644
--- a/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
+++ b/medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
@@ -8,7 +8,8 @@ class TokenizerTests(TestCase):
     TEXT_SIMPLE = ("This is - some simple test and 32 numbers 2-tokenize! "
                    "And then some!")
     EXP_TOKENS = ["This", "is", "-", "some", "simple", "test", "and", "32",
-                  "numbers", "2", "-tokenize!", "And", "then", "some", "!"]
+                  "numbers", "2", "-", "tokenize", "!", "And", "then", "some",
+                  "!"]
     BIG_NUMBER = 10_000_000
 
     @classmethod