""" Prepartion classes for UMLS data in csv or other formats
"""
import pandas
import spacy
from spacy.tokenizer import Tokenizer
from medcat.cdb import CDB
from medcat.preprocessing.tokenizers import spacy_split_all
from medcat.preprocessing.cleaners import spacy_tag_punct, clean_name, clean_def
from spacy.tokens import Token
from medcat.utils.spacy_pipe import SpacyPipe
#from pytorch_pretrained_bert import BertTokenizer
import numpy as np
from functools import partial
# Check scispacy models
from medcat.utils.helpers import check_scispacy
check_scispacy()


class PrepareCDB(object):
    """ Prepares CDB data in CSV format for annotations;
    once everything is done the result is in the `cdb` field.
    """
    SEPARATOR = ""
    NAME_SEPARATOR = "|"
    CONCEPT_LENGTH_LIMIT = 20
    SKIP_STOPWORDS = False
    # It is important that CLEAN is last
    VERSIONS = ['RAW', 'CLEAN']

    def __init__(self, vocab=None, pretrained_cdb=None, word_tokenizer=None):
        self.vocab = vocab
        if pretrained_cdb is None:
            self.cdb = CDB()
        else:
            self.cdb = pretrained_cdb

        # Build the required spacy pipeline
        self.nlp = SpacyPipe(spacy_split_all, disable=['ner', 'parser'])
        self.nlp.add_punct_tagger(tagger=partial(spacy_tag_punct, skip_stopwords=self.SKIP_STOPWORDS))

        # Get the tokenizer
        if word_tokenizer is not None:
            self.tokenizer = word_tokenizer
        else:
            self.tokenizer = self._tok

    def _tok(self, text):
        # Fallback tokenizer: treat the whole string as a single token
        return [text]
    def prepare_csvs(self, csv_paths, sep=',', encoding=None, escapechar=None, only_existing=False,
                     add_cleaner=None, only_new=False):
        """ Compile one or multiple CSVs into an internal CDB class

        csv_paths:  an array of paths to the CSV files that should be processed
        sep:  if necessary, a custom separator for the CSV files

        return:  the compiled CDB class
        """
        _new_cuis = set()
        for csv_path in csv_paths:
            df = pandas.read_csv(csv_path, sep=sep, encoding=encoding, escapechar=escapechar, index_col=False)
            cols = list(df.columns)
            str_ind = cols.index('str')
            cui_ind = cols.index('cui')

            tui_ind = -1
            if 'tui' in cols:
                tui_ind = cols.index('tui')
            tui_name_ind = -1
            if 'sty' in cols:
                tui_name_ind = cols.index('sty')
            tty_ind = -1
            if 'tty' in cols:
                tty_ind = cols.index('tty')
            desc_ind = -1
            if 'desc' in cols:
                desc_ind = cols.index('desc')
            onto_ind = -1
            if 'onto' in cols:
                onto_ind = cols.index('onto')
            is_unique_ind = -1
            if 'is_unique' in cols:
                is_unique_ind = cols.index('is_unique')
            examples_ind = -1
            if 'examples' in cols:
                examples_ind = cols.index('examples')
            for ind in range(len(df)):
                names = str(df.iat[ind, str_ind]).split(self.NAME_SEPARATOR)
                if ind % 10000 == 0:
                    print("Done: {}".format(ind))

                for _name in names:
                    skip_raw = False
                    for version in self.VERSIONS:
                        # Get the cui
                        cui = str(df.iat[ind, cui_ind])

                        if only_new:
                            # Add only new concepts, skip existing ones
                            #_tmp_name = clean_name(_name).lower().replace(" ", "")
                            if (cui in self.cdb.cui2names and cui not in _new_cuis): #and _tmp_name in self.cdb.name2cui:
                                continue
                            else:
                                if cui not in self.cdb.cui2names:
                                    _new_cuis.add(cui)

                        if (version == "RAW" and skip_raw) or \
                           (only_existing and cui not in self.cdb.cui2names):
                            continue

                        # Save originals
                        pretty_name = _name
                        original_name = _name
                        name = _name

                        if version == "CLEAN" and add_cleaner is not None:
                            name = add_cleaner(name)
                        name = clean_name(name)

                        # Clean and preprocess the name
                        sc_name = self.nlp(name)
                        if version == 'CLEAN':
                            tokens = [str(t.lemma_).lower() for t in sc_name if not t._.is_punct
                                      and not t._.to_skip]
                        elif version == 'RAW':
                            tokens = [str(t.lower_) for t in sc_name if not t._.is_punct
                                      and not t._.to_skip]
                        tokens_vocab = [t.lower_ for t in sc_name if not t._.is_punct]

                        # Don't allow concept names to be above CONCEPT_LENGTH_LIMIT
                        if len(tokens) > self.CONCEPT_LENGTH_LIMIT:
                            continue

                        name = self.SEPARATOR.join(tokens)
                        tmp_name = "".join(tokens)
                        if add_cleaner is None and name == self.SEPARATOR.join(tokens_vocab):
                            # Both names are the same, skip the raw version
                            skip_raw = True

                        is_pref_name = False
                        if 'tty' in df.columns:
                            _tmp = str(df.iat[ind, tty_ind])
                            if _tmp.lower().strip() == 'pn':
                                is_pref_name = True

                        # Skip concepts that are digits or where every token is a single letter
                        length_one = [True if len(x) < 2 else False for x in tokens]
                        if tmp_name.isdigit() or all(length_one):
                            continue
                        # Create snames (cumulative sub-names) of the name,
                        # e.g. tokens ['heart', 'attack'] -> ['heart', 'heartattack']
                        snames = []
                        sname = ""
                        for token in tokens:
                            sname = sname + token + self.SEPARATOR
                            snames.append(sname.strip())

                        # Check is_unique
                        is_unique = None
                        if 'is_unique' in df.columns:
                            _tmp = str(df.iat[ind, is_unique_ind]).strip()
                            if _tmp.lower().strip() == '0':
                                is_unique = False
                            elif _tmp.lower().strip() == '1':
                                is_unique = True

                        # Get the ontology, called 'sab' in UMLS
                        onto = 'default'
                        if 'onto' in df.columns:
                            onto = str(df.iat[ind, onto_ind])

                        # Get the tui
                        tui = None
                        if 'tui' in df.columns:
                            _tui = str(df.iat[ind, tui_ind]).strip()
                            if len(_tui) > 0 and _tui != "nan":
                                tui = _tui
                                # TODO: If there are multiple tuis just take the first one
                                if len(tui.split(',')) > 1:
                                    tui = tui.split(',')[0]

                        tui_name = None
                        if 'sty' in df.columns:
                            _sty = str(df.iat[ind, tui_name_ind]).strip()
                            if len(_sty) > 0 and _sty != "nan":
                                tui_name = _sty

                        # Get the concept description
                        desc = None
                        if 'desc' in df.columns:
                            _desc = str(df.iat[ind, desc_ind]).strip()
                            if len(_desc) > 0:
                                desc = _desc

                        # Add the concept
                        self.cdb.add_concept(cui, name, onto, tokens, snames,
                                             tui=tui, pretty_name=pretty_name,
                                             tokens_vocab=tokens_vocab, is_unique=is_unique,
                                             desc=desc, original_name=original_name,
                                             is_pref_name=is_pref_name, tui_name=tui_name)

                        # Process examples if we have them
                        examples = []
                        if 'examples' in df.columns:
                            tmp = str(df.iat[ind, examples_ind]).strip().split(self.NAME_SEPARATOR)
                            for example in tmp:
                                example = example.strip()
                                if len(example) > 0:
                                    examples.append(example)

                        # If we have examples, average their word vectors into a context vector
                        for example in examples:
                            doc = self.nlp(example)
                            cntx = []
                            for word in doc:
                                if not word._.to_skip:
                                    for w in self.tokenizer(word.lower_):
                                        if w in self.vocab and self.vocab.vec(w) is not None:
                                            cntx.append(self.vocab.vec(w))
                            if len(cntx) > 1:
                                cntx = np.average(cntx, axis=0)
                                self.cdb.add_context_vec(cui, cntx, cntx_type='MED')

        return self.cdb
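

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). The file paths
# below are hypothetical placeholders, and the Vocab import path plus the
# load_dict/save_dict calls are assumptions based on the surrounding MedCAT API:
#
#   from medcat.utils.vocab import Vocab  # assumed import path
#
#   vocab = Vocab()
#   vocab.load_dict('<path_to_vocab.dat>')  # only needed if 'examples' rows are used
#
#   preparator = PrepareCDB(vocab=vocab)
#   cdb = preparator.prepare_csvs(['concepts.csv'])
#   cdb.save_dict('<path_to_output_cdb.dat>')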