Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions medcat-v2/medcat/tokenizing/regex_impl/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def _entity_from_tokens(doc: Document, tokens: list[MutableToken],


class RegexTokenizer(BaseTokenizer):
REGEX = r'((\b\w+\b|\S+)\s?)'
REGEX = re.compile(r'(([^a-zA-Z0-9\s]+|\b\w+\b|\S+)\s?)')
# group 1: text with whitespace (if present)
# group 2: text with no whitespace

Expand All @@ -347,8 +347,12 @@ def entity_from_tokens(self, tokens: list[MutableToken]) -> MutableEntity:
end_index = doc._tokens.index(tokens[-1])
return _entity_from_tokens(doc, tokens, start_index, end_index)

def _get_tokens_matches(self, text: str) -> list[re.Match[str]]:
tokens = self.REGEX.finditer(text)
return list(tokens)

def __call__(self, text: str) -> MutableDocument:
tokens = re.finditer(self.REGEX, text)
tokens = self._get_tokens_matches(text)
doc = Document(text)
for tkn_index, match in enumerate(tokens):
start_index = match.start()
Expand Down
40 changes: 39 additions & 1 deletion medcat-v2/tests/model_creation/test_cdb_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import logging
import os
import numpy as np
import pandas as pd
from medcat.model_creation.cdb_maker import CDBMaker
from medcat.cdb import CDB
from medcat.config import Config
Expand All @@ -15,12 +16,15 @@


class CDBMakerBaseTests(unittest.TestCase):
use_spacy = False

@classmethod
def setUpClass(cls):
cls.config = Config()
cls.config.general.log_level = logging.DEBUG
cls.config.general.nlp.modelname = "en_core_web_md"
if cls.use_spacy:
cls.config.general.nlp.provider = 'spacy'
cls.config.general.nlp.modelname = "en_core_web_md"
cls.maker = CDBMaker(cls.config)
csvs = [
os.path.join(MODEL_CREATION_RES_PATH, 'cdb.csv'),
Expand All @@ -29,6 +33,40 @@ def setUpClass(cls):
cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True)


class MakeWithDashes(CDBMakerBaseTests):
    """Verifies that concept names with dashes/punctuation are normalised.

    Whitespace and punctuation (e.g. spaces, dashes, apostrophes) are
    replaced by the separator (``~``) in stored names, names are
    lower-cased, and consecutive separators collapse into one.
    """
    cui = '69482004'
    namelist = [
        "Korsakoff's psychosis",
        'Wernicke-Korsakoff syndrome',
        'Korsakov syndrome - alcoholic',
    ]
    expected_names = [
        "korsakoff~s~psychosis",
        "wernicke~korsakoff~syndrome",
        "korsakov~syndrome~alcoholic",
    ]
    # One-column-per-key frame; the scalar cui broadcasts across all names.
    cui_df = pd.DataFrame({'cui': cui, 'name': namelist})

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        # Feed the dashed names into the CDB built by the base class.
        cls.maker.prepare_csvs([cls.cui_df], full_build=True)

    def test_has_cui(self):
        self.assertIn(self.cui, self.cdb.cui2info)

    def test_has_full_names(self):
        for name in self.expected_names:
            with self.subTest(f"Name: {name}"):
                self.assertIn(name, self.cdb.name2info.keys())


class MakeWithDashesSpacy(MakeWithDashes):
    """Repeats the dash-normalisation checks with the spacy NLP provider.

    Setting ``use_spacy`` makes the base ``setUpClass`` configure
    ``config.general.nlp.provider = 'spacy'`` before building the CDB.
    """
    use_spacy = True


class CDBMakerLoadTests(CDBMakerBaseTests):
EXPECTED_NAMES = {
'C0000039': {'virus~k', 'virus', 'virus~m', 'virus~z'},
Expand Down
3 changes: 2 additions & 1 deletion medcat-v2/tests/tokenizing/regex_impl/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ class TokenizerTests(TestCase):
TEXT_SIMPLE = ("This is - some simple test and 32 numbers 2-tokenize! "
"And then some!")
EXP_TOKENS = ["This", "is", "-", "some", "simple", "test", "and", "32",
"numbers", "2", "-tokenize!", "And", "then", "some", "!"]
"numbers", "2", "-", "tokenize", "!", "And", "then", "some",
"!"]
BIG_NUMBER = 10_000_000

@classmethod
Expand Down
Loading