# UMLS

This code aims to create a knowledge graph (KG) composed of both **UMLS's Semantic Network Types** and the **UMLS Metathesaurus**.

## Importing

In [1]:
from genericpath import exists
import pymysql, csv
from argparse import ArgumentParser
import sys
from tqdm import tqdm

## Setting global variables

In [2]:
# Connection settings passed to pymysql.connect() when opening the UMLS database.
host, user = 'localhost', 'root'
password, database = 'adm', 'umls'

## Connecting to UMLS SQL Database

In [3]:
# Open the MySQL connection using the settings defined above and create the
# cursor that every later query in this notebook runs on.
print('Connect Database')
conn = pymysql.connect(
    host=host,
    user=user,
    password=password,
    database=database,
)
cursor = conn.cursor()

Connect Database


## Processing UMLS SN Types

In [4]:
# Prepare the Semantic Type node export: open the output CSV, write its header
# row, and fetch every SRDEF row whose record type (RT) is 'STY'.
print('Process UMLS Types')
exists_types = set()  # UIs already written, used to skip duplicates
# newline='' is required by the csv module; without it the writer's '\r\n'
# line terminator is translated again on Windows and yields '\r\r\n'.
out = open('SRDEF.processed.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(out)
cols = ['STY:ID', 'id', ':LABEL', 'name', 'DEF', 'ABR']
writer.writerow(cols)
cursor.execute("select UI, STY_RL, DEF, ABR from umls.SRDEF where RT = 'STY';")
mrsty = cursor.fetchall()

Process UMLS Types


In [5]:
# Write one 'SemanticType' node row per distinct UI. Each fetched row holds
# (UI, STY_RL, DEF, ABR), matching the column list of the SRDEF query.
for ui, sty_rl, definition, abr in mrsty:
    if ui in exists_types:
        continue
    writer.writerow([ui, ui, 'SemanticType', sty_rl, definition, abr])
    exists_types.add(ui)
# Close explicitly so buffered rows reach disk now, as the other export cells
# do, instead of relying on the handle being garbage-collected later.
out.close()
print(f'{len(exists_types)} Semantic Types')

127 Semantic Types


## Processing UMLS Concepts

In [6]:
# Export one 'Concept' node per distinct identifier found in column 0 of
# MRCONSO, keeping only rows whose column 1 equals 'ENG'.
# NOTE(review): indices assume the standard MRCONSO column order (CUI first,
# LAT second, STR fourth from the end) — confirm against the loaded schema.
print('Process Concepts')
exists_concept = set()
cols = ['CUI:ID', 'id', ':LABEL', 'name']
cursor.execute("select * from umls.MRCONSO;")
# Kept as a module-level name because the atom export further down iterates
# the same rows again.
mrconso = cursor.fetchall()
# The with-block guarantees the CSV is flushed and closed (the handle used to
# stay open); newline='' is what the csv module requires of its output file.
with open('MRCONSO.processed.csv', 'w', encoding='utf-8', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(cols)
    for line in mrconso:
        if line[0] in exists_concept:
            continue
        if line[1] == 'ENG':
            writer.writerow([line[0], line[0], 'Concept', line[-4]])
            exists_concept.add(line[0])
print(f'{len(exists_concept)} concepts')

Process Concepts
3263433 concepts


In [36]:
# Rebuild the set of concept identifiers (column 0 of rows whose column 1 is
# 'ENG') straight from the database, without rewriting the CSV export.
cursor.execute("select * from umls.MRCONSO;")
mrconso = cursor.fetchall()
exists_concept = {row[0] for row in mrconso if row[1] == 'ENG'}
print(f'{len(exists_concept)} concepts')

3263433 concepts


## Processing UMLS Atoms

In [7]:
# Prepare the Atom node export: open the output CSV and write its header row.
print('Process Atoms')
# newline='' is required by the csv module; without it the writer's '\r\n'
# line terminator is translated again on Windows and yields '\r\r\n'.
out = open('MRAUI.processed.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(out)
cols = ['AUI:ID', 'id', ':LABEL', 'name', 'CUI']
writer.writerow(cols)
exists_atom = set()  # identifiers already written, used to skip duplicates

Process Atoms


In [8]:
# Write one 'Atom' node per distinct value of column 7 in the MRCONSO rows
# already fetched from the database, keeping only rows whose column 1 is 'ENG'.
# The previous version wrapped this loop in open('MRCONSO.RRF'), but never read
# from that file; the open only made the cell fail when the RRF file was
# absent, so it has been removed.
# NOTE(review): indices assume the standard MRCONSO column order (CUI at 0,
# AUI at 7, STR fourth from the end) — confirm against the loaded schema.
for line in mrconso:
    if line[7] in exists_atom:
        continue
    if line[1] == 'ENG':
        writer.writerow([line[7], line[7], 'Atom', line[-4], line[0]])
        exists_atom.add(line[7])
out.close()
print(f'{len(exists_atom)} atoms')

9356538 atoms


## Processing UMLS Relations

### SN Type to SN Type

#### SN Relations

In [26]:
# Fetch every SRDEF row whose record type (RT) is 'RL' and set up the
# containers that the next cell fills from those rows.
print('Process UMLS Types')
cursor.execute("select UI, STY_RL, ABR from umls.SRDEF where RT = 'RL';")
mrsrelation = cursor.fetchall()
exists_sn_relations = set()
sn_relation_dict = {}

Process UMLS Types


In [27]:
# Map each relation UI to its (STY_RL, ABR) pair; only the first row seen for
# a given UI is kept.
for ui, sty_rl, abr in mrsrelation:
    if ui not in exists_sn_relations:
        sn_relation_dict[ui] = (sty_rl, abr)
        exists_sn_relations.add(ui)

#### SN Type to Type through relation

In [33]:
# Load every row of SRSTRE1 into memory; the next cell filters these rows
# against exists_types.
print('Process STY relationships')
cursor.execute("select * from umls.SRSTRE1;")
snrel = cursor.fetchall()

Process STY relationships


In [35]:
# Export Semantic Type -> Semantic Type edges from the SRSTRE1 rows.
# Column 0 is used as the start node, column 2 as the end node, and column 1
# is looked up in sn_relation_dict to obtain the relation's (STY_RL, ABR) pair.
cols = [':START_ID', ':END_ID', ':TYPE', 'RELA']
count = 0
# The with-block closes the file even if a lookup below raises, and
# newline='' is what the csv module requires of its output file.
with open('SRSTRE1.processed.csv', 'w', encoding='utf-8', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(cols)
    for line in snrel:
        start_node = line[0]
        end_node = line[2]
        # Only keep edges whose two endpoints were exported as nodes.
        if start_node in exists_types and end_node in exists_types:
            rel = sn_relation_dict[line[1]]
            # ABR goes into :TYPE, STY_RL into RELA.
            writer.writerow([start_node, end_node, rel[1], rel[0]])
        # Counts every row read, whether or not it was written.
        count += 1

print(f"{count} relationships")

6217 relationships


### Concept to SN Type

In [41]:
# Fetch all (CUI, TUI) pairs from MRSTY; the following cells turn them into
# Concept -> Semantic Type edges.
print('Process SN to UMLS Concept relationships')
cursor.execute("select CUI, TUI from umls.MRSTY;")
mrcontype = cursor.fetchall()

Process SN to UMLS Concept relationships


In [42]:
# Open the Concept -> Semantic Type edge export and write its header row.
# newline='' is required by the csv module; without it the writer's '\r\n'
# line terminator is translated again on Windows and yields '\r\r\n'.
out = open('MRSTY.processed.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(out)
cols = [':START_ID', ':END_ID', ':TYPE']
writer.writerow(cols)

25

In [43]:
# Write an 'STY' edge for each (CUI, TUI) pair whose concept and semantic type
# were both exported as nodes. The counter tallies every pair read, whether or
# not it was written.
count = 0
for cui, tui in mrcontype:
    if cui in exists_concept and tui in exists_types:
        writer.writerow([cui, tui, 'STY'])
    count += 1
out.close()
print(f"{count} relationships")

3595722 relationships


### Concept to Atom

In [9]:
# Export edges between concepts and/or atoms from the MRREL rows.
# NOTE(review): indices assume the standard MRREL column order
# (CUI1, AUI1, STYPE1, REL, CUI2, AUI2, STYPE2, RELA, ...) — confirm against
# the loaded schema.
print('Process relationships')

# Every identifier exported as a node, whether atom or concept.
cui_aui = exists_atom | exists_concept
cursor.execute("select * from umls.MRREL;")
mrrel = cursor.fetchall()

cols = [':START_ID', ':END_ID', ':TYPE', 'RELA']
count = 0
# The previous version also opened 'MRREL.RRF' without ever reading it; that
# open only made the cell fail when the file was absent, so it is gone.
# newline='' is what the csv module requires of its output file.
with open('MRREL.processed.csv', 'w', encoding='utf-8', newline='') as out:
    writer = csv.writer(out)
    writer.writerow(cols)
    for line in mrrel:
        # Start node: column 5 when column 6 is 'AUI', otherwise column 4.
        start_node = line[5] if line[6] == 'AUI' else line[4]
        # End node: column 1 when column 2 is 'AUI', otherwise column 0.
        end_node = line[1] if line[2] == 'AUI' else line[0]
        if start_node in cui_aui and end_node in cui_aui:
            # Fixed: the row previously referenced the undefined name `c`
            # where the end node belongs.
            writer.writerow([start_node, end_node, line[3], line[7]])
        # Counts every row read, whether or not it was written.
        count += 1
print(f"{count} relationships")

Process relationships
55685992 relationships
