Skip to content

Commit

Permalink
Merge pull request #48 from CogStack/develop
Browse files Browse the repository at this point in the history
Importing old CDB and more control over batch_size in multiproc
  • Loading branch information
w-is-h committed Apr 10, 2021
2 parents d3c46ae + 1e47365 commit 7ce8f42
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 9 deletions.
26 changes: 21 additions & 5 deletions medcat/cat.py
Expand Up @@ -719,12 +719,13 @@ def get_json(self, text, only_cui=False, addl_info=['cui2icd10', 'cui2ontologies
return json.dumps(out)


def multiprocessing(self, in_data, nproc=8, batch_size=100, only_cui=False, addl_info=[]):
def multiprocessing(self, in_data, nproc=8, batch_size_chars=1000000, max_chars_in_memory=-1, only_cui=False, addl_info=[]):
r''' Run multiprocessing NOT FOR TRAINING
in_data: an iterator or array with format: [(id, text), (id, text), ...]
nproc: number of processors
batch_size: obvious
batch_size_chars: size of a batch in number of characters
max_chars_in_memory: if set (> 0), it will limit the number of chars that can be processed at the same time by all processes combined
return: a list of tuples: [(id, doc_json), (id, doc_json), ...]
'''
Expand All @@ -738,21 +739,25 @@ def multiprocessing(self, in_data, nproc=8, batch_size=100, only_cui=False, addl
manager = Manager()
out_dict = manager.dict()
out_dict['processed'] = []
out_dict['current_size_in_chars'] = 0

# Create processes
procs = []
for i in range(nproc):
p = Process(target=self._mp_cons, kwargs={'in_q': in_q, 'out_dict': out_dict, 'pid': i, 'only_cui': only_cui,
'addl_info': addl_info})
'addl_info': addl_info, 'max_chars_in_memory': max_chars_in_memory})
p.start()
procs.append(p)

data = []
nchars = 0
for id, text in in_data:
data.append((id, str(text)))
if len(data) == batch_size:
nchars += len(str(text))
if nchars >= batch_size_chars:
in_q.put(data)
data = []
nchars = 0
# Put the last batch if it exists
if len(data) > 0:
in_q.put(data)
Expand All @@ -779,7 +784,7 @@ def multiprocessing(self, in_data, nproc=8, batch_size=100, only_cui=False, addl
return out


def _mp_cons(self, in_q, out_dict, pid=0, only_cui=False, addl_info=[]):
def _mp_cons(self, in_q, out_dict, pid=0, only_cui=False, addl_info=[], max_chars_in_memory=-1):
cnt = 0
out = []
while True:
Expand All @@ -791,7 +796,18 @@ def _mp_cons(self, in_q, out_dict, pid=0, only_cui=False, addl_info=[]):

for id, text in data:
try:
if max_chars_in_memory > 0:
while out_dict['current_size_in_chars'] > max_chars_in_memory:
self.log.debug("Process: {}, waiting because current chars in memory: {}".format(pid, out_dict['current_size_in_chars']))
# Wait until size in memory is low
sleep(1)
out_dict['current_size_in_chars'] = out_dict['current_size_in_chars'] + len(text)
self.log.debug("Process: {}, current chars in memory (before) is: {}".format(pid, out_dict['current_size_in_chars']))
# Annotate document
doc = self.get_entities(text=text, only_cui=only_cui, addl_info=addl_info)

out_dict['current_size_in_chars'] = out_dict['current_size_in_chars'] - len(text)
self.log.debug("Process: {}, current chars in memory (after) is: {}".format(pid, out_dict['current_size_in_chars']))
doc['text'] = text
out.append((id, doc))
except Exception as e:
Expand Down
5 changes: 3 additions & 2 deletions medcat/cdb.py
Expand Up @@ -380,12 +380,13 @@ def import_old_cdb_vectors(self, cdb):
self.cui2count_train[cui] = cdb.cui_count[cui]


def import_old_cdb(self, cdb):
def import_old_cdb(self, cdb, import_vectors=True):
r''' Import all data except for cuis and names from an old CDB.
'''

# Import vectors
self.import_old_cdb_vectors(cdb)
if import_vectors:
self.import_old_cdb_vectors(cdb)

# Import TUIs
for cui in cdb.cui2names:
Expand Down
3 changes: 2 additions & 1 deletion medcat/config.py
Expand Up @@ -29,7 +29,8 @@ def __init__(self):
self.general = {
# Logging config for everything | 'tagger' can be disabled, but will cause a drop in performance
'log_level': logging.INFO,
'log_format': '%(asctime)s: %(message)s',
'log_format': '%(levelname)s:%(name)s: %(message)s',
'log_path': './medcat.log',
'spacy_disabled_components': ['ner', 'parser', 'vectors', 'textcat',
'entity_linker', 'sentencizer', 'entity_ruler', 'merge_noun_chunks',
'merge_entities', 'merge_subtokens'],
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -9,7 +9,7 @@

setuptools.setup(
name="medcat",
version="1.0.6",
version="1.0.8",
author="w-is-h",
author_email="w.kraljevic@gmail.com",
description="Concept annotation tool for Electronic Health Records",
Expand Down

0 comments on commit 7ce8f42

Please sign in to comment.