In [1]:
# Generation of updated vocabulary files for minimap

In [2]:
import pandas.io.sql as sqlio
import mysql.connector
import tqdm
import csv
import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
import pandas as pd
import tqdm
from collections import defaultdict
import pickle

In [3]:
# Open a connection to the MySQL database named 'umls' as user 'root' (no
# password or host is passed here). `cnx` is reused by the SQL queries below.
cnx = mysql.connector.connect(user='root', database='umls')

In [4]:
# Load spaCy's "en_core_web_sm" pipeline; it is used further down to lemmatise
# the normalised UMLS strings.
nlp = spacy.load("en_core_web_sm")

In [5]:
# regular expressions and text processing functions

import re

# Word list (one entry per line) of prepositions/conjunctions.  syn_uninv()
# refuses to un-invert a term when one of these appears after the first ', '.
with open('../robotreviewer/data/minimap/prepositions_conjunctions.txt', 'r') as f:
    # Skip blank lines: an empty alternative in the alternation below would
    # match at *every* word boundary, which would silently disable
    # un-inversion for all terms.
    prep_conj = [l.strip() for l in f if l.strip()]

if prep_conj:
    # Escape each entry so it is matched literally rather than as regex syntax.
    prep_conj_re = re.compile(r'\b({})\b'.format('|'.join(re.escape(w) for w in prep_conj)))
else:
    # Empty word list: use a pattern that can never match.
    prep_conj_re = re.compile(r'(?!)')
# NOTE(review): this pattern is case-sensitive and pipeline() does not
# lowercase before syn_uninv() -- confirm that capitalised entries
# (e.g. "Of") are not expected to block un-inversion.

# Standalone "NOS" token.  The pattern is case-sensitive (uppercase only), so
# it only has an effect when applied BEFORE the text is lowercased.
nos_ignore = re.compile(r'\bNOS\b')
# An apostrophe, optionally followed by "s", directly after a word character
# and followed by a word boundary.
pos_ignore = re.compile(r"(?<=\w)(\'s?)\b")
# A leading bracketed code: one of X, V, D, M, EDTA, SO, Q in square brackets.
left_paren = re.compile(r"^\[(X|V|D|M|EDTA|SO|Q)\]")
# A run of word characters wrapped in an opening ( or [ and a closing ) or ]
# (the two bracket kinds are not required to pair up).
paren = re.compile(r"[\(\[]\w+[\)\]]")
# Any run of whitespace.
strip_space = re.compile(r"\s+")

def remove_nos(text):
    """Return `text` with every match of `nos_ignore` replaced by one space.

    `nos_ignore` matches the standalone, uppercase token "NOS", so this has
    no effect on text that has already been lowercased.
    """
    return re.sub(nos_ignore, ' ', text)

def remove_pos(text):
    """Return `text` with every match of `pos_ignore` deleted.

    `pos_ignore` matches an apostrophe (optionally followed by "s") that comes
    directly after a word character and is followed by a word boundary.
    """
    return re.sub(pos_ignore, '', text)

def syn_uninv(text):
    """Undo a "head, modifier" inversion around the first ', ' in `text`.

    The text is returned unchanged when:
      * it contains no ', ';
      * nothing follows the first ', ';
      * the part after the first ', ' matches `prep_conj_re`.

    Otherwise the part after the first ', ' is moved to the front and joined
    to the part before it with a single space.
    """
    head, separator, tail = text.partition(', ')

    # No ', ' at all, or the ', ' is the very end of the string.
    if not separator or not tail:
        return text

    # A preposition/conjunction after the comma means this is not treated as
    # an inverted term.
    if prep_conj_re.search(tail):
        return text

    return tail + " " + head
    
def ne_parentheticals(text_str):
    """Strip bracketed fragments from `text_str`.

    First removes a leading match of `left_paren`, then removes every match of
    `paren` from what remains.  Surrounding whitespace is left untouched.
    """
    without_prefix = left_paren.sub('', text_str)
    return paren.sub('', without_prefix)


In [6]:
# pipelines

def minimap(text_str, chunks=False):
    """Normalise free text with pipeline() (non-UMLS mode) and pass it to matcher().

    `chunks` is forwarded to matcher() unchanged.
    NOTE(review): `matcher` is not defined in the visible part of this
    notebook -- confirm where it is expected to come from.
    """
    normalised = pipeline(text_str, umls_mode=False)
    return matcher(normalised, chunks=chunks)


def pipeline(text_str, umls_mode=True):
    """Normalise a string for dictionary matching.

    Steps, in order:
      1. (umls_mode only) remove bracketed fragments via ne_parentheticals();
      2. replace hyphens with spaces;
      3. (umls_mode only) syntactic un-inversion via syn_uninv();
      4. strip possessives via remove_pos();
      5. (umls_mode only) remove "NOS" tokens via remove_nos();
      6. collapse whitespace runs to one space and trim both ends.

    Lowercasing is deliberately NOT done here; callers lowercase the result
    themselves where needed.
    """
    cleaned = ne_parentheticals(text_str) if umls_mode else text_str
    cleaned = cleaned.replace('-', ' ')

    if umls_mode:
        cleaned = syn_uninv(cleaned)

    cleaned = remove_pos(cleaned)

    if umls_mode:
        cleaned = remove_nos(cleaned)

    return strip_space.sub(' ', cleaned).strip()

In [7]:
# Source rows for the CUI -> preferred-term map: English (LAT='ENG') strings
# from MRCONSO, restricted to the six source vocabularies listed in the query.
# Alternative that reads a previously exported file instead of the database:
# df = pd.read_csv('cui_str.csv', sep='\t')
df =  sqlio.read_sql_query("SELECT str, cui, sab from MRCONSO where sab in ('MSH', 'RXNORM', 'SNOMEDCT_US', 'MDR', 'ICD10', 'ATC') and LAT='ENG';", cnx)

In [9]:
# Build the map from normalised, lowercased, lemmatised string -> list of CUIs.

# CUIs present in the vocabularies that were selected into `df`.
interesting_cuis = set(df.cui.values)

# A CUI is appended once per matching row, so lists may contain duplicates
# at this stage.
str_to_cui_full = defaultdict(list)

# newline='' is required by the csv module so that it, rather than the text
# layer, interprets line endings (matters for quoted fields containing
# newlines or a bare '\r').
# NOTE(review): no encoding is given, so the platform default is used --
# confirm it matches how umls_full_index.csv was written.
with open('umls_full_index.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    for row in tqdm.tqdm(reader):
        # just keep those which are in the Cochrane vocabs
        if row['cui'] not in interesting_cuis:
            continue
        # Normalise in UMLS mode, lowercase, then key on the space-joined lemmas.
        # NOTE(review): one nlp() call per row is slow on the full index;
        # nlp.pipe() could batch this -- confirm output parity before switching.
        doc = nlp(pipeline(row['str'], umls_mode=True).lower())
        str_to_cui_full[' '.join(t.lemma_ for t in doc)].append(row['cui'])

8340it [00:38, 226.85it/s]

KeyboardInterrupt: 

In [15]:
# Plain-dict copy of str_to_cui_full with duplicate CUIs removed from each list.
# The order of CUIs inside each list is whatever set iteration yields.
str_to_cui = {
    term: list(set(cuis))
    for term, cuis in str_to_cui_full.items()
}

In [16]:
import pickle
# Persist the de-duplicated string -> CUIs map.
# This previously pickled `str_to_cui_full` (the defaultdict that still holds
# duplicate CUIs), which left the de-duplicated `str_to_cui` unused.
with open('str_to_cui.pck', 'wb') as f:
    pickle.dump(str_to_cui, f)

In [17]:


# For every CUI, keep one string per source vocabulary (SAB).  If a CUI has
# several rows for the same vocabulary, the last row in `df` wins.
cui_to_pstr = defaultdict(dict)
# Zip the three columns instead of df.iterrows(): same values in the same row
# order, without building a Series object per row.  `total` lets tqdm show a
# real progress bar.
for cui, sab, term in tqdm.tqdm(zip(df['cui'], df['sab'], df['str']), total=len(df)):
    cui_to_pstr[cui][sab] = term
# Vocabulary precedence used when choosing a single string per CUI
# (earlier entries are preferred).
order = ["RXNORM", "MSH", "SNOMEDCT_US", "ICD10", "MDR", "ATC"]



0it [00:00, ?it/s][A[A

928it [00:00, 9279.26it/s][A[A

2032it [00:00, 9743.66it/s][A[A

3086it [00:00, 9969.61it/s][A[A

4165it [00:00, 10201.27it/s][A[A

5235it [00:00, 10343.94it/s][A[A

6229it [00:00, 10216.61it/s][A[A

7305it [00:00, 10373.39it/s][A[A

8460it [00:00, 10699.64it/s][A[A

9593it [00:00, 10880.88it/s][A[A

10724it [00:01, 11005.22it/s][A[A

11802it [00:01, 10794.97it/s][A[A

12867it [00:01, 10584.13it/s][A[A

13977it [00:01, 10731.24it/s][A[A

15130it [00:01, 10956.90it/s][A[A

16250it [00:01, 11027.96it/s][A[A

17359it [00:01, 11045.72it/s][A[A

18463it [00:01, 11035.95it/s][A[A

19576it [00:01, 11063.94it/s][A[A

20723it [00:01, 11181.00it/s][A[A

21864it [00:02, 11246.73it/s][A[A

23020it [00:02, 11337.88it/s][A[A

24154it [00:02, 11061.46it/s][A[A

25276it [00:02, 11108.16it/s][A[A

26417it [00:02, 11196.00it/s][A[A

27544it [00:02, 11216.02it/s][A[A

28681it [00:02, 11261.71it/s][A[A

29808it [00:02, 11077.0

486642it [00:43, 10942.42it/s][A[A

487775it [00:43, 11053.46it/s][A[A

488881it [00:44, 10907.37it/s][A[A

489985it [00:44, 10944.55it/s][A[A

491136it [00:44, 11108.26it/s][A[A

492271it [00:44, 11177.81it/s][A[A

493410it [00:44, 11238.73it/s][A[A

494535it [00:44, 11155.10it/s][A[A

495652it [00:44, 11056.68it/s][A[A

496786it [00:44, 11138.70it/s][A[A

497912it [00:44, 11173.94it/s][A[A

499061it [00:44, 11266.79it/s][A[A

500189it [00:45, 11247.48it/s][A[A

501315it [00:45, 11081.35it/s][A[A

502431it [00:45, 11098.76it/s][A[A

503570it [00:45, 11182.55it/s][A[A

504720it [00:45, 11274.27it/s][A[A

505855it [00:45, 11294.82it/s][A[A

506985it [00:45, 11141.11it/s][A[A

508100it [00:45, 11035.35it/s][A[A

509220it [00:45, 11082.51it/s][A[A

510360it [00:45, 11174.05it/s][A[A

511514it [00:46, 11279.19it/s][A[A

512643it [00:46, 11256.31it/s][A[A

513770it [00:46, 11038.10it/s][A[A

514919it [00:46, 11167.42it/s][A[A

516047it [00

970858it [01:27, 11081.00it/s][A[A

971967it [01:27, 11021.22it/s][A[A

973070it [01:27, 10936.53it/s][A[A

974206it [01:27, 11059.39it/s][A[A

975356it [01:27, 11186.51it/s][A[A

976499it [01:27, 11257.97it/s][A[A

977626it [01:28, 11227.11it/s][A[A

978750it [01:28, 11005.19it/s][A[A

979899it [01:28, 11144.02it/s][A[A

981015it [01:28, 11067.78it/s][A[A

982123it [01:28, 10996.22it/s][A[A

983253it [01:28, 11084.36it/s][A[A

984363it [01:28, 11054.30it/s][A[A

985494it [01:28, 11129.49it/s][A[A

986636it [01:28, 11212.86it/s][A[A

987775it [01:28, 11263.86it/s][A[A

988912it [01:29, 11294.14it/s][A[A

990053it [01:29, 11326.99it/s][A[A

991190it [01:29, 11337.46it/s][A[A

992324it [01:29, 11314.29it/s][A[A

993456it [01:29, 11041.37it/s][A[A

994562it [01:29, 11021.00it/s][A[A

995707it [01:29, 11144.95it/s][A[A

996840it [01:29, 11199.00it/s][A[A

997961it [01:29, 11185.83it/s][A[A

999081it [01:29, 11167.21it/s][A[A

1000199it [0

1441203it [02:10, 11195.00it/s][A[A

1442325it [02:10, 10873.07it/s][A[A

1443416it [02:10, 10790.24it/s][A[A

1444562it [02:10, 10980.53it/s][A[A

1445688it [02:10, 11062.70it/s][A[A

1446835it [02:10, 11180.94it/s][A[A

1447955it [02:10, 10925.25it/s][A[A

1449050it [02:10, 10769.47it/s][A[A

1450170it [02:11, 10895.05it/s][A[A

1451283it [02:11, 10963.35it/s][A[A

1452420it [02:11, 11080.44it/s][A[A

1453530it [02:11, 11043.91it/s][A[A

1454636it [02:11, 10927.24it/s][A[A

1455730it [02:11, 10917.42it/s][A[A

1456866it [02:11, 11045.34it/s][A[A

1457999it [02:11, 11126.82it/s][A[A

1459113it [02:11, 11022.34it/s][A[A

1460216it [02:11, 10873.03it/s][A[A

1461305it [02:12, 10687.74it/s][A[A

1462376it [02:12, 10033.69it/s][A[A

1463389it [02:12, 9528.05it/s] [A[A

1464517it [02:12, 9993.38it/s][A[A

1465551it [02:12, 10092.66it/s][A[A

1466589it [02:12, 10176.07it/s][A[A

1467687it [02:12, 10403.06it/s][A[A

1468734it [02:12, 10300.44

In [18]:
# Choose one string per CUI: the string from the first vocabulary in `order`
# for which that CUI has an entry.  CUIs with no entry in any listed
# vocabulary are left out.
cui_to_str = {}

for cui, str_by_sab in cui_to_pstr.items():
    best_sab = next((sab for sab in order if sab in str_by_sab), None)
    if best_sab is not None:
        cui_to_str[cui] = str_by_sab[best_sab]
                
            

In [19]:
# Write the CUI -> string map to disk as a pickle.
with open('cui_to_str.pck', 'wb') as out_file:
    pickle.dump(cui_to_str, out_file)

In [22]:
import networkx as nx

In [21]:
# (Removed a leftover debug expression that referenced `graph_data` before it
# is defined, which raised NameError on Restart & Run All.)

In [23]:
# Load the tab-separated CUI relation table and build a directed graph with
# one edge cui2 -> cui1 per row.
# NOTE(review): presumably cui2 is the broader concept, given the output file
# is later named "cui_subtrees" -- confirm against how cui_graph.csv was made.
graph_data = pd.read_csv('cui_graph.csv', sep='\t')
G = nx.DiGraph()
# Zip the two columns instead of graph_data.iterrows(): same edges in the same
# order, without building a Series object per row.
G.add_edges_from(
    tqdm.tqdm(zip(graph_data['cui2'], graph_data['cui1']), total=len(graph_data))
)




0it [00:00, ?it/s][A[A

941it [00:00, 9408.01it/s][A[A

1942it [00:00, 9580.10it/s][A[A

2965it [00:00, 9766.06it/s][A[A

3979it [00:00, 9874.27it/s][A[A

5042it [00:00, 10084.75it/s][A[A

6103it [00:00, 10235.72it/s][A[A

7018it [00:00, 9599.07it/s] [A[A

8022it [00:00, 9726.80it/s][A[A

9081it [00:00, 9970.61it/s][A[A

10095it [00:01, 10020.26it/s][A[A

11179it [00:01, 10250.77it/s][A[A

12271it [00:01, 10442.59it/s][A[A

13344it [00:01, 10526.44it/s][A[A

14413it [00:01, 10572.66it/s][A[A

15496it [00:01, 10648.16it/s][A[A

16558it [00:01, 10505.13it/s][A[A

17607it [00:01, 10428.46it/s][A[A

18680it [00:01, 10516.45it/s][A[A

19782it [00:01, 10662.22it/s][A[A

20891it [00:02, 10786.63it/s][A[A

21971it [00:02, 10734.75it/s][A[A

23062it [00:02, 10786.06it/s][A[A

24141it [00:02, 10735.08it/s][A[A

25229it [00:02, 10776.00it/s][A[A

26307it [00:02, 10542.14it/s][A[A

27363it [00:02, 10375.59it/s][A[A

28403it [00:02, 10116.69it

446420it [00:45, 10307.58it/s][A[A

447465it [00:45, 10345.72it/s][A[A

448500it [00:45, 10331.67it/s][A[A

449581it [00:45, 10468.47it/s][A[A

450653it [00:45, 10541.72it/s][A[A

451708it [00:45, 10522.72it/s][A[A

452802it [00:45, 10642.38it/s][A[A

453867it [00:45, 10566.02it/s][A[A

454948it [00:46, 10637.21it/s][A[A

456013it [00:46, 10533.06it/s][A[A

457080it [00:46, 10571.98it/s][A[A

458138it [00:46, 10534.17it/s][A[A

459193it [00:46, 10537.97it/s][A[A

460264it [00:46, 10588.50it/s][A[A

461333it [00:46, 10618.65it/s][A[A

462396it [00:46, 10547.56it/s][A[A

463451it [00:46, 10486.04it/s][A[A

464500it [00:46, 10482.99it/s][A[A

465549it [00:47, 10481.23it/s][A[A

466607it [00:47, 10510.09it/s][A[A

467659it [00:47, 10495.04it/s][A[A

468717it [00:47, 10519.85it/s][A[A

469770it [00:47, 10413.56it/s][A[A

470812it [00:47, 10188.72it/s][A[A

471833it [00:47, 10160.98it/s][A[A

472888it [00:47, 10273.14it/s][A[A

473961it [00

913332it [01:29, 10522.39it/s][A[A

914403it [01:29, 10577.95it/s][A[A

915509it [01:29, 10717.41it/s][A[A

916616it [01:29, 10819.98it/s][A[A

917700it [01:30, 10800.45it/s][A[A

918781it [01:30, 10779.07it/s][A[A

919875it [01:30, 10824.98it/s][A[A

920974it [01:30, 10872.97it/s][A[A

922062it [01:30, 10857.07it/s][A[A

923185it [01:30, 10965.07it/s][A[A

924293it [01:30, 10998.39it/s][A[A

925394it [01:30, 10952.15it/s][A[A

926490it [01:30, 10923.64it/s][A[A

927597it [01:30, 10966.56it/s][A[A

928694it [01:31, 9859.68it/s] [A[A

929782it [01:31, 10144.05it/s][A[A

930868it [01:31, 10348.29it/s][A[A

931973it [01:31, 10548.72it/s][A[A

933069it [01:31, 10668.73it/s][A[A

934157it [01:31, 10729.22it/s][A[A

935248it [01:31, 10782.02it/s][A[A

936336it [01:31, 10808.55it/s][A[A

937420it [01:31, 10817.74it/s][A[A

938521it [01:31, 10874.71it/s][A[A

939630it [01:32, 10935.88it/s][A[A

940727it [01:32, 10943.81it/s][A[A

941834it [01

In [25]:
# Write the whole directed graph `G` to disk as a pickle.
with open('cui_subtrees.pck', 'wb') as out_file:
    pickle.dump(G, out_file)

In [6]:
# Fetch ten rows of (str, cui) from MRCONSO as a quick look at the table.
# The query has no language/source filter and no ORDER BY.
dat = sqlio.read_sql_query("SELECT str, cui from MRCONSO limit 10;", cnx)


In [7]:
# Display the sample rows fetched above.
dat

Unnamed: 0,str,cui
0,(131)I-Macroaggregated Albumin,C0000005
1,(131)I-MAA,C0000005
2,Macroagrégats d'albumine marquée à l'iode 131,C0000005
3,MAA-I 131,C0000005
4,Macroagrégats d'albumine humaine marquée à l'i...,C0000005
5,"1,2-dipalmitoylfosfatidylcholin",C0000039
6,"1,2-dipalmitoylphosphatidylcholine",C0000039
7,"1,2-dipalmitoylphosphatidylcholine",C0000039
8,"1,2-Dipalmitoylphosphatidylcholine",C0000039
9,"1,2 Dipalmitoylphosphatidylcholine",C0000039
