The objective of this notebook is to integrate the different modalities into one knowledge graph. The modalities are:

- structural: constituted by the triples of the knowledge graph
- uncertain: constituted by the factuality based confidence scores
- temporal: constituted by the time validity of the different triples
- semantic: constituted by the triples' definitions

In [1]:
import pandas as pd
from os.path import join
import json
# Directory holding the triple-level inputs; origin_time.jsonl is read from
# here via TRIPLES_DIR (other readers below spell the same folder literally).
TRIPLES_DIR = 'all_triples'
# Not referenced by the code visible in this notebook.
CITATIONS_DIR = 'CITATIONS'
# Not referenced by the code visible in this notebook.
TRIP_SENT_DIR = 'TRIPLE_SENTENCES'
# Root folder the extracted sub-KGs are written to (see the to_csv cells).
SELECTED_TRIPLES = 'Selected Triples'
# Folder with the processed UMLS files (sem_network.csv, rel_heirarchy.csv,
# ent_type_heirarchy.csv, umls_biokg_rel_patterns.json).
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
PROC_UMLS = '/home/pc/Desktop/AdilStuff/Projects/SemRepMed/umls processed net data'

# CUIs used as the entities of interest for the COVID study; the entity
# description table shown further down names several of them (e.g.
# C5203676 "SARS-CoV-2", C5203670 "COVID-19").
COVID_CUIs = ['C5203670', 'C5203671', 'C5203672', 'C5203673', 'C5203674', 'C5203675', 'C5203676']

In [2]:
def write_json_lines(file_name, dict_data):
    """Append ``dict_data`` to ``file_name`` as one JSON-encoded line.

    The object is serialised before the file is opened, and the file is
    opened in append mode, so lines already present are preserved.

    Args:
        file_name: Path of the JSON-lines file to append to.
        dict_data: Any object ``json.dumps`` can serialise.
    """
    record = f"{json.dumps(dict_data)}\n"
    with open(file_name, 'a') as out_file:
        out_file.write(record)
        
def read_json_lines(file_name):
    """Read a JSON-lines file and return its records as a list.

    Every non-blank line is parsed with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_name: Path of the JSON-lines file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_name) as file_in:
        # ``line.strip()`` is empty for blank lines, which carry no record.
        return [json.loads(line) for line in file_in if line.strip()]

def read_json(file_name):
    """Parse the JSON document stored at ``file_name`` and return the result."""
    with open(file_name, 'r') as handle:
        return json.load(handle)

def get_all_data(fully_complete = True):
    """Load every modality of the knowledge graph and return it as a dict.

    Previously the loaded objects were assigned to locals and discarded
    (the function returned ``None``); they are now handed back to the caller.

    Args:
        fully_complete: Currently unused; kept so existing calls keep working.

    Returns:
        A dict with the following keys:

        * ``'ent_2_type_map'``: content of ``all_entity_types.json``.
        * ``'ent_2_name_map'``: content of ``all_entity_names.json``.
        * ``'str_un_mod_df'``: ``triples_probabilities.csv`` (structural and
          uncertain modalities).
        * ``'time_mod_df'``: ``origin_time.jsonl`` (temporal modality).
        * ``'ent_sem_mod_df'``: ``entity_description_can.csv``.
        * ``'rel_sem_mod_df'``: ``relation_type_description.csv``.
        * ``'type_sem_mod_df'``: ``entity_type_description.csv``.
        * ``'transitive_rels'`` / ``'composition_rels'``: the entries of the
          same name in ``umls_biokg_rel_patterns.json``.
        * ``'rel_heirarchy'``: ``rel_heirarchy.csv``.
        * ``'ent_type_heirarchy'``: ``ent_type_heirarchy.csv``.
        * ``'sem_net'``: ``sem_network.csv`` (meta KG).
    """
    # entity to type / entity to name mappings
    ent_2_type_map = read_json(join(TRIPLES_DIR, 'all_entity_types.json'))
    ent_2_name_map = read_json(join(TRIPLES_DIR, 'all_entity_names.json'))

    # structural and uncertain modality data (gzip-compressed CSV)
    str_un_mod_df = pd.read_csv(join(TRIPLES_DIR, 'triples_probabilities.csv'), compression = 'gzip')
    # time modality data
    time_mod_df = pd.DataFrame(read_json_lines(join(TRIPLES_DIR, 'origin_time.jsonl')))

    # semantic modality data: entities, relations and entity types
    ent_sem_mod_df = pd.read_csv(join(TRIPLES_DIR, 'entity_description_can.csv'), compression = 'gzip')
    rel_sem_mod_df = pd.read_csv(join(TRIPLES_DIR, 'relation_type_description.csv'), compression = 'gzip')
    type_sem_mod_df = pd.read_csv(join(TRIPLES_DIR, 'entity_type_description.csv'), compression = 'gzip')

    # structural semantics: relation patterns, hierarchies and the meta KG
    tran_comp = read_json(join(PROC_UMLS, 'umls_biokg_rel_patterns.json'))
    rel_heirarchy = pd.read_csv(join(PROC_UMLS, 'rel_heirarchy.csv'))
    ent_type_heirarchy = pd.read_csv(join(PROC_UMLS, 'ent_type_heirarchy.csv'))
    sem_net = pd.read_csv(join(PROC_UMLS, 'sem_network.csv'))

    return {
        'ent_2_type_map': ent_2_type_map,
        'ent_2_name_map': ent_2_name_map,
        'str_un_mod_df': str_un_mod_df,
        'time_mod_df': time_mod_df,
        'ent_sem_mod_df': ent_sem_mod_df,
        'rel_sem_mod_df': rel_sem_mod_df,
        'type_sem_mod_df': type_sem_mod_df,
        'transitive_rels': tran_comp['transitive_rels'],
        'composition_rels': tran_comp['composition_rels'],
        'rel_heirarchy': rel_heirarchy,
        'ent_type_heirarchy': ent_type_heirarchy,
        'sem_net': sem_net,
    }

def completeness_filtering(str_un_mod_df):
    """Keep only the triples for which every modality has supporting data.

    The filters are applied in this order, and the remaining row count is
    printed after each one:

    1. subject and object both have a non-null ``DEFINITION``;
    2. the triple's ``ORIGIN_ID`` appears in ``origin_time.jsonl``;
    3. subject and object types appear as an ``abbreviation`` in the entity
       type descriptions;
    4. the predicate appears as a ``name`` or ``relation_inverse`` in the
       relation descriptions.

    Args:
        str_un_mod_df: Triples frame with the columns ``SUBJECT_CUI``,
            ``OBJECT_CUI``, ``ORIGIN_ID``, ``SUBJECT_TYPE``, ``OBJECT_TYPE``
            and ``PREDICATE``.

    Returns:
        The rows of ``str_un_mod_df`` that pass all four filters.
    """
    # ---- reference tables for the temporal and semantic modalities
    origin_times = pd.DataFrame(read_json_lines(join(TRIPLES_DIR, 'origin_time.jsonl')))
    entity_defs = pd.read_csv(join('all_triples', 'entity_description_can.csv'), compression = 'gzip')
    entity_defs = entity_defs[entity_defs['DEFINITION'].notna()]
    relation_defs = pd.read_csv(join('all_triples', 'relation_type_description.csv'), compression = 'gzip')
    type_defs = pd.read_csv(join('all_triples', 'entity_type_description.csv'), compression = 'gzip')

    kept = str_un_mod_df
    print('original count: ', kept.shape[0])

    # 1) both ends of the triple must have a definition
    described = entity_defs['ENTITY']
    subject_described = kept['SUBJECT_CUI'].isin(described)
    object_described = kept['OBJECT_CUI'].isin(described)
    kept = kept[subject_described & object_described]
    print('count leaving only ents with description: ', kept.shape[0])

    # 2) the triple must be listed in the time table
    # NOTE(review): the time table preview further down shows rows with empty
    # time_min/time_max (true_time False), so membership alone may not
    # guarantee a usable date -- confirm whether those should be dropped.
    has_time = kept['ORIGIN_ID'].isin(origin_times['ORIGIN_ID'])
    kept = kept[has_time]
    print('count leaving only triples with time: ', kept.shape[0])

    # 3) both entity types must have a description
    known_types = type_defs['abbreviation']
    subject_typed = kept['SUBJECT_TYPE'].isin(known_types)
    object_typed = kept['OBJECT_TYPE'].isin(known_types)
    kept = kept[subject_typed & object_typed]
    print('count leaving only types with : ', kept.shape[0])

    # 4) the predicate must be a described relation or its inverse
    known_predicates = [*relation_defs['name'], *relation_defs['relation_inverse']]
    kept = kept[kept['PREDICATE'].isin(known_predicates)]
    print('count leaving only relations with description: ', kept.shape[0])
    return kept

def get_completed_data():
    """Return the triples that are complete across all modalities.

    Not all entities have a description and not all triples have a date.
    This loads the full triples table, annotates subject and object with
    their entity type, and passes the result through
    ``completeness_filtering``.

    Returns:
        The filtered triples frame, including the added ``SUBJECT_TYPE`` and
        ``OBJECT_TYPE`` columns.
    """
    # entity to type mapping
    ent_2_type_map = read_json(join('all_triples', 'all_entity_types.json'))
    # structural and uncertain modality data
    str_un_mod_df = pd.read_csv(join('all_triples', 'triples_probabilities.csv'), compression = 'gzip')
    str_un_mod_df['SUBJECT_TYPE'] = str_un_mod_df['SUBJECT_CUI'].map(ent_2_type_map)
    str_un_mod_df['OBJECT_TYPE'] = str_un_mod_df['OBJECT_CUI'].map(ent_2_type_map)
    # The frame is local to this function, so no defensive copy is needed
    # before filtering.
    return completeness_filtering(str_un_mod_df)

def get_ent_types():
    """Load the full triples table annotated with entity types.

    Returns:
        The content of ``triples_probabilities.csv`` with two extra columns,
        ``SUBJECT_TYPE`` and ``OBJECT_TYPE``, obtained by mapping
        ``SUBJECT_CUI`` and ``OBJECT_CUI`` through ``all_entity_types.json``.
        CUIs missing from that mapping get a null type.
    """
    # entity to type mapping
    ent_2_type_map = read_json(join('all_triples', 'all_entity_types.json'))
    # structural and uncertain modality data
    str_un_mod_df = pd.read_csv(join('all_triples', 'triples_probabilities.csv'), compression = 'gzip')
    str_un_mod_df['SUBJECT_TYPE'] = str_un_mod_df['SUBJECT_CUI'].map(ent_2_type_map)
    str_un_mod_df['OBJECT_TYPE'] = str_un_mod_df['OBJECT_CUI'].map(ent_2_type_map)
    return str_un_mod_df

def get_sub_schema_triples(ent_types):
    """Select the triples whose subject and object types are both in ``ent_types``.

    Args:
        ent_types: List-like of entity type abbreviations to keep
            (e.g. ``['dsyn', 'virs']``).

    Returns:
        The rows of the full triples table, with added ``SUBJECT_TYPE`` and
        ``OBJECT_TYPE`` columns, where both types belong to ``ent_types``.
    """
    # entity to type mapping
    ent_2_type_map = read_json(join('all_triples', 'all_entity_types.json'))
    # structural and uncertain modality data
    str_un_mod_df = pd.read_csv(join('all_triples', 'triples_probabilities.csv'), compression = 'gzip')
    str_un_mod_df['SUBJECT_TYPE'] = str_un_mod_df['SUBJECT_CUI'].map(ent_2_type_map)
    str_un_mod_df['OBJECT_TYPE'] = str_un_mod_df['OBJECT_CUI'].map(ent_2_type_map)
    # both ends of the triple must carry one of the requested types
    in_schema = str_un_mod_df['SUBJECT_TYPE'].isin(ent_types) & str_un_mod_df['OBJECT_TYPE'].isin(ent_types)
    return str_un_mod_df[in_schema]

def get_specialized_data(intrest_ents):
    """Extract a schema-complete sub-KG around the entities of interest.

    Steps:

    a) look up the entity types of ``intrest_ents``;
    b) collect, from the semantic network (meta KG), every type that shares
       an edge with one of those types;
    c) keep only triples whose subject and object types are both in that
       set, then apply ``completeness_filtering``;
    d) add back every triple that touches one of ``intrest_ents`` (taken
       from the unfiltered table, so they are kept even when they fail the
       filters) and drop exact duplicate rows.

    The row counts of the full KG and of the schema-restricted KG are
    printed along the way.

    Args:
        intrest_ents: List-like of CUIs to build the sub-KG around.

    Returns:
        The resulting triples frame with ``SUBJECT_TYPE`` and ``OBJECT_TYPE``
        columns.
    """
    # entity to type mapping
    ent_2_type_map = read_json(join('all_triples', 'all_entity_types.json'))

    # types connected to the types of the entities of interest in the meta KG
    intrest_ents_types = list(map(ent_2_type_map.get, intrest_ents))
    sem_net = pd.read_csv(join(PROC_UMLS, 'sem_network.csv'))
    touching = sem_net[sem_net['head_type'].isin(intrest_ents_types) | sem_net['tail_type'].isin(intrest_ents_types)]
    extend_sem_types = list(set(touching['head_type']) | set(touching['tail_type']))

    ### full KG with ent sem types
    str_un_mod_df = pd.read_csv(join('all_triples', 'triples_probabilities.csv'), compression = 'gzip')
    str_un_mod_df['SUBJECT_TYPE'] = str_un_mod_df['SUBJECT_CUI'].map(ent_2_type_map)
    str_un_mod_df['OBJECT_TYPE'] = str_un_mod_df['OBJECT_CUI'].map(ent_2_type_map)
    # triples with entities of interest that must always be included
    sub_intrest = str_un_mod_df[str_un_mod_df['SUBJECT_CUI'].isin(intrest_ents) | str_un_mod_df['OBJECT_CUI'].isin(intrest_ents)]
    print('full kg: ', str_un_mod_df.shape[0])

    ### restrict to the extended schema
    str_un_mod_df = str_un_mod_df[str_un_mod_df['SUBJECT_TYPE'].isin(extend_sem_types) & str_un_mod_df['OBJECT_TYPE'].isin(extend_sem_types)]
    print('target schema complet kg: ', str_un_mod_df.shape[0])

    # The boolean-mask selection above already produced a new frame, so it
    # can be handed to the filter without an extra copy.
    str_un_mod_df = completeness_filtering(str_un_mod_df)
    # re-add the triples of interest, then remove rows present in both parts
    str_un_mod_df = pd.concat([str_un_mod_df, sub_intrest]).drop_duplicates()
    return str_un_mod_df
    

## Full Triples

In [23]:
get_completed_data()

original count:  28416917
count leaving only ents with description:  12104432
count leaving only triples with time:  12104432
count leaving only types with :  12104432
count leaving only relations with description:  8269295


Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,SUBJECT_TYPE,OBJECT_TYPE
5,5,CAUSES,C0042776,C0012634,0.92434,virs,dsyn
10,9,PART_OF,C0242210,C0006034,1.00000,aapp,bact
13,12,INTERACTS_WITH,C0004611,C0025646,0.96875,bact,aapp
14,13,AFFECTS,C0031516,C0017262,0.93750,bacs,genf
16,15,PART_OF,C1167395,C0042567,0.99097,orgm,vtbt
...,...,...,...,...,...,...,...
28416904,23857431,AFFECTS,C2985438,C0282577,1.00000,comd,dsyn
28416907,23857433,PROCESS_OF,C1519323,C1333691,1.00000,genf,aapp
28416908,23857433,PROCESS_OF,C1519323,C1333691,1.00000,genf,aapp
28416915,23857436,PROCESS_OF,C4023614,C1413914,1.00000,genf,aapp


In [5]:
covid_selected_df = get_specialized_data(COVID_CUIs)
# 8170408

full kg:  28416917
target schema complet kg:  28048295
original count:  28048295
count leaving only ents with description:  11965927
count leaving only triples with time:  11965927
count leaving only types with :  11965927
count leaving only relations with description:  8170408


In [6]:
covid_selected_df

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,SUBJECT_TYPE,OBJECT_TYPE
5,5,CAUSES,C0042776,C0012634,0.92434,virs,dsyn
10,9,PART_OF,C0242210,C0006034,1.00000,aapp,bact
13,12,INTERACTS_WITH,C0004611,C0025646,0.96875,bact,aapp
14,13,AFFECTS,C0031516,C0017262,0.93750,bacs,genf
16,15,PART_OF,C1167395,C0042567,0.99097,orgm,vtbt
...,...,...,...,...,...,...,...
28415776,23856508,PREDISPOSES,C0026769,C5203670,1.00000,dsyn,dsyn
28415805,23856535,CAUSES,C5203670,C0302158,1.00000,dsyn,patf
28416431,23857062,PREVENTS,C0055361,C5203676,1.00000,phsu,virs
28416685,23857260,COEXISTS_WITH,C1510586,C5203670,1.00000,mobd,dsyn


In [10]:
covid_selected_df[covid_selected_df['SUBJECT_CUI'].isin(COVID_CUIs) | covid_selected_df['OBJECT_CUI'].isin(COVID_CUIs)]

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,SUBJECT_TYPE,OBJECT_TYPE
24628645,20881082,COEXISTS_WITH,C0161479,C5203670,1.000000,inpo,dsyn
24628646,20881083,PROCESS_OF,C5203676,C0030705,0.982900,virs,humn
24628656,20881093,CAUSES,C5203676,C0275518,0.916667,virs,dsyn
24628657,20881094,CAUSES,C0206750,C5203670,0.993223,dsyn,dsyn
24628660,20881097,ASSOCIATED_WITH,C0014695,C5203670,0.775862,phsu,dsyn
...,...,...,...,...,...,...,...
28415776,23856508,PREDISPOSES,C0026769,C5203670,1.000000,dsyn,dsyn
28415805,23856535,CAUSES,C5203670,C0302158,1.000000,dsyn,patf
28416431,23857062,PREVENTS,C0055361,C5203676,1.000000,phsu,virs
28416685,23857260,COEXISTS_WITH,C1510586,C5203670,1.000000,mobd,dsyn


In [11]:
covid_selected_df.to_csv(join(SELECTED_TRIPLES, 'COVID Complet', 'full_kg.csv'), compression = 'gzip', index= False)

## Type Based Study
The objective of this extraction process is to define a domain-specific, schema-complete KG.

Risk factor type abbreviations are:
'hlca', 'eehu', 'ocac', 'aggp', 'dora', 'orga', 'socb', 'imft', 'hops', 'famg', 'genf', 'geoa'
(see 'UMLS Semantic Network Structure' for how they were identified, based on https://www.researchgate.net/publication/258741187_Comorbidities_Modeling_for_Supporting_Integrated_Care_in_Chronic_Cardiorenal_Disease)

### Select the right types

In [10]:
str_un_mod_df = get_ent_types()

In [11]:
str_un_mod_df

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,SUBJECT_TYPE,OBJECT_TYPE
0,0,PROCESS_OF,C0003725,C0999630,1.000000,gngm,mamm
1,1,ISA,C0039258,C0446169,1.000000,virs,virs
2,2,ISA,C0318627,C0206590,1.000000,virs,virs
3,3,ISA,C0446169,C0003725,1.000000,virs,gngm
4,4,PROCESS_OF,C0012634,C0020114,0.989018,dsyn,grup
...,...,...,...,...,...,...,...
28416912,23857434,COEXISTS_WITH,C1413909,C1413914,1.000000,gngm,aapp
28416913,23857435,PROCESS_OF,C4023614,C1413909,1.000000,genf,gngm
28416914,23857435,PROCESS_OF,C4023614,C1413909,1.000000,genf,gngm
28416915,23857436,PROCESS_OF,C4023614,C1413914,1.000000,genf,aapp


In [15]:
# covid related types
def get_types(str_un_mod_df, cuis=None):
    """Return the distinct entity types the given CUIs take in the triples.

    Args:
        str_un_mod_df: Triples frame with ``SUBJECT_CUI``, ``OBJECT_CUI``,
            ``SUBJECT_TYPE`` and ``OBJECT_TYPE`` columns.
        cuis: List-like of CUIs to look up. Defaults to ``COVID_CUIs`` when
            omitted, which is the original behaviour.

    Returns:
        A list of the distinct ``SUBJECT_TYPE`` values of rows whose subject
        is in ``cuis`` together with the distinct ``OBJECT_TYPE`` values of
        rows whose object is in ``cuis``. The order is not defined.
    """
    if cuis is None:
        cuis = COVID_CUIs
    as_subject = str_un_mod_df.loc[str_un_mod_df['SUBJECT_CUI'].isin(cuis), 'SUBJECT_TYPE']
    as_object = str_un_mod_df.loc[str_un_mod_df['OBJECT_CUI'].isin(cuis), 'OBJECT_TYPE']
    return list(set(as_subject) | set(as_object))

In [16]:
get_types(str_un_mod_df)

['aapp', 'dsyn', 'virs', 'fndg', 'topp']

### Extracting the SubKG

In [17]:
ent_types = ['aapp', 'dsyn', 'virs', 'fndg', 'topp', 'hlca', 'eehu', 'ocac', 'aggp', 'dora', 'orga', 'socb', 'imft', 'hops', 'famg', 'genf', 'geoa']

In [19]:
df = get_sub_schema_triples(ent_types)

In [20]:
df.to_csv(join(SELECTED_TRIPLES, 'risk factor covid kg', 'full_kg.csv'), compression = 'gzip', index= False)

## Tests

In [19]:
pd.DataFrame(read_json_lines(join(TRIPLES_DIR, 'origin_time.jsonl')))

Unnamed: 0,ORIGIN_ID,time_min,time_max,true_time
0,0,2006-03-15 00:00:00,2006-03-15 00:00:00,True
1,1,1991-05-01 00:00:00,2006-03-15 00:00:00,True
2,2,2006-03-15 00:00:00,2006-03-15 00:00:00,True
3,3,1988-04-01 00:00:00,2006-03-15 00:00:00,True
4,4,1947-10-01 00:00:00,2017-07-01 00:00:00,True
...,...,...,...,...
23601729,23857432,,,False
23601730,23857433,,,False
23601731,23857434,,,False
23601732,23857435,,,False


In [9]:
# probability component
pd.read_csv(join('all_triples', 'triples_probabilities.csv'), compression = 'gzip')

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba
0,0,PROCESS_OF,C0003725,C0999630,1.000000
1,1,ISA,C0039258,C0446169,1.000000
2,2,ISA,C0318627,C0206590,1.000000
3,3,ISA,C0446169,C0003725,1.000000
4,4,PROCESS_OF,C0012634,C0020114,0.989018
...,...,...,...,...,...
28416912,23857434,COEXISTS_WITH,C1413909,C1413914,1.000000
28416913,23857435,PROCESS_OF,C4023614,C1413909,1.000000
28416914,23857435,PROCESS_OF,C4023614,C1413909,1.000000
28416915,23857436,PROCESS_OF,C4023614,C1413914,1.000000


In [10]:
# entity description

ent_defs = pd.read_csv(join('all_triples', 'entity_description_can.csv'), compression = 'gzip')

In [27]:
ent_defs[ent_defs['ENTITY'].isin(COVID_CUIs)]

Unnamed: 0,ENTITY,DEFINITION,ALIASES,NAME
358897,C5203676,,Wuhan coronavirus|2019-nCoV|2019 novel coronav...,SARS-CoV-2
358902,C5203670,,Disease caused by 2019-nCoV|Disease caused by ...,COVID-19
359004,C5203671,,Suspected disease caused by 2019-nCoV|Suspecte...,Suspected COVID-19
430308,C5203672,,2019 novel coronavirus vaccination|2019-nCoV v...,SARS-CoV-2 vaccination
447893,C5203674,,Antibody to 2019 novel coronavirus|Antibody to...,Antibody to SARS-CoV-2
471009,C5203674,,Antibody to 2019 novel coronavirus|Antibody to...,Antibody to SARS-CoV-2


In [13]:
ent_defs[~ent_defs['DEFINITION'].isna()]

Unnamed: 0,ENTITY,DEFINITION,ALIASES,NAME
0,C0003725,Arthropod-borne viruses. A non-taxonomic desig...,Arbovirus (navigational concept)|arbovirus|Arb...,Arboviruses
3,C0012634,A definite pathologic process with a character...,Disease|Clinical disease or syndrome|Clinical ...,Disease
4,C0042776,Minute infectious agents whose genomes are com...,"Virus|Virus, NOS|Viruses, General|Virus (organ...",Virus
6,C0242210,General term for proteins that have binding as...,Ligand Binding Protein|Binding Protein|binding...,Binding Proteins
9,C0242210,General term for proteins that have binding as...,Ligand Binding Protein|Binding Protein|binding...,Binding Proteins
...,...,...,...,...
474849,C1328042,An epithelial neoplasm arising from the thymus...,Plump Cell Thymoma|type B thymoma|epithelioid ...,Thymoma Type B
474854,C1149654,Catalysis of the reaction: dUTP + H2O = dUMP +...,dUTPase activity|dUTP pyrophosphatase activity...,dUTP pyrophosphatase activity
474861,C1327041,The chemical reactions and pathways resulting ...,mycothiol formation|mycothiol biosynthesis|myc...,mycothiol biosynthetic process
474862,C4048299,"Tissue factor (295 aa, ~33 kDa) is encoded by ...",Thromboplastin|F3 Protein|CD142 Antigen|F3|TF|...,Tissue Factor


In [30]:
read_json(join('all_triples', 'all_entity_types.json'))

{'C0999630': 'mamm',
 'C0446169': 'virs',
 'C0206590': 'virs',
 'C0003725': 'gngm',
 'C0020114': 'grup',
 'C0012634': 'dsyn',
 'C0003241': 'aapp',
 'C0002273': 'gngm',
 'C0006034': 'bact',
 'C0019878': 'aapp',
 'C0025646': 'aapp',
 'C0017262': 'genf',
 'C0001675': 'humn',
 'C0042567': 'vtbt',
 'C1167395': 'orgm',
 'C0003062': 'anim',
 'C0162318': 'anim',
 'C1136254': 'phsu',
 'C0949665': 'orch',
 'C0013227': 'phsu',
 'C0014061': 'dsyn',
 'C0030705': 'humn',
 'C0086287': 'orga',
 'C0086582': 'orga',
 'C1552130': 'bpoc',
 'C0237401': 'humn',
 'C1457887': 'sosy',
 'C0021311': 'dsyn',
 'C0089701': 'orch',
 'C0012854': 'bacs',
 'C0456981': 'imft',
 'C0021368': 'patf',
 'C0004368': 'patf',
 'C0039194': 'cell',
 'C0003315': 'gngm',
 'C0670896': 'aapp',
 'C1185625': 'bsoj',
 'C0597357': 'aapp',
 'C0035549': 'orch',
 'C0162638': 'celf',
 'C0998896': 'fish',
 'C0332124': 'fndg',
 'C0015780': 'orga',
 'C0998689': 'euka',
 'C0017199': 'emst',
 'C0162415': 'emst',
 'C0181909': 'medd',
 'C0005539': 

In [16]:
read_json(join('all_triples', 'all_entity_names.json'))

{'C0999630': 'Lepus capensis (organism)',
 'C0446169': 'California Group Viruses',
 'C0206590': 'Coltivirus',
 'C0003725': 'JUN',
 'C0020114': 'Human',
 'C0012634': 'Disease',
 'C0003241': 'Antibodies',
 'C0002273': 'FHL1',
 'C0006034': 'Borrelia burgdorferi',
 'C0019878': 'homocysteine',
 'C0025646': 'methionine',
 'C0017262': 'Gene Expression',
 'C0001675': 'Adult',
 'C0042567': 'Vertebrates',
 'C1167395': 'Host (organism)',
 'C0003062': 'Animals',
 'C0162318': 'Farm Animals',
 'C1136254': 'Microbicides',
 'C0949665': 'Fluoroquinolones',
 'C0013227': 'Pharmaceutical Preparations',
 'C0014061': 'Tick-Borne Encephalitis',
 'C0030705': 'Patients',
 'C0086287': 'Females',
 'C0086582': 'Males',
 'C1552130': '16s',
 'C0237401': 'Individual',
 'C1457887': 'Symptoms',
 'C0021311': 'Infection',
 'C0089701': '1-oleoyl-2-stearoylphosphatidylcholine',
 'C0012854': 'DNA',
 'C0456981': 'Specific antigen',
 'C0021368': 'Inflammation',
 'C0004368': 'Autoimmunity',
 'C0039194': 'T-Lymphocyte',
 'C000

In [17]:
pd.read_csv(join('all_triples', 'relation_type_description.csv'), compression = 'gzip')

Unnamed: 0,name,definition,usage_note,relation_inverse,abbreviation
0,PHYSICALLY_RELATED_TO,Related by virtue of some physical attribute o...,,PHYSICALLY_RELATED_TO,PR
1,PART_OF,"Composes, with one or more other physical unit...",,HAS_PART,PT
2,CONTAINS,Holds or is the receptacle for fluids or other...,,CONTAINED_IN,CT
3,LOCATION_OF,"The position, site, or region of an entity or ...",,HAS_LOCATION,LO
4,TEMPORALLY_RELATED_TO,"Related in time by preceding, co-occuring with...",,TEMPORALLY_RELATED_TO,TR
5,CO-OCCURS_WITH,"Occurs at the same time as, together with, or ...",,CO-OCCURS_WITH,CW
6,PRECEDES,Occurs earlier in time. This includes antedate...,,FOLLOWS,PC
7,FUNCTIONALLY_RELATED_TO,Related by the carrying out of some function o...,,FUNCTIONALLY_RELATED_TO,FR
8,PROCESS_OF,"Action, function, or state of.",,HAS_PROCESS,PO
9,CARRIES_OUT,Executes a function or performs a procedure or...,,CARRIED_OUT_BY,CO


In [3]:
entity_type_desc = pd.read_csv(join('all_triples', 'entity_type_description.csv'), compression = 'gzip')

In [4]:
entity_type_desc

Unnamed: 0,name,definition,usage_note,abbreviation
0,Organism,"Generally, a living individual, including all ...",,orgm
1,Plant,"An organism having cellulose cell walls, growi...",,plnt
2,Fungus,A eukaryotic organism characterized by the abs...,,fngs
3,Virus,An organism consisting of a core of a single n...,,virs
4,Bacterium,"A small, typically one-celled, prokaryotic mic...",,bact
...,...,...,...,...
131,Lipid,An inclusive group of fat or fat-derived subst...,Phospholipids should not also be typed with 'O...,lipd
132,Neuroreactive Substance or Biogenic Amine,An endogenous substance whose activities affec...,Unlike many of the other types under 'Biologic...,nsba
133,Alga,A chiefly aquatic plant that contains chloroph...,,alga
134,Rickettsia or Chlamydia,An organism intermediate in size and complexit...,,rich


In [15]:
entity_type_desc[entity_type_desc['name'].str.contains('Virus')]

Unnamed: 0,name,definition,usage_note,abbreviation
3,Virus,An organism consisting of a core of a single n...,,virs


In [28]:
pd.read_csv(join(PROC_UMLS, 'sem_network.csv'))

Unnamed: 0,head_type,rel,tail_type
0,acab,affects,amph
1,acab,affects,anim
2,acab,affects,arch
3,acab,affects,bact
4,acab,affects,bird
...,...,...,...
6100,vita,isa,enty
6101,vita,isa,phob
6102,vita,isa,sbst
6103,vita,issue_in,bmod
