In [1]:
import pandas as pd
from os.path import join
from glob import glob
from tqdm.notebook import tqdm
import gc

def collect_garbage():
    print(gc.get_count())
    gc.collect()
    print(gc.get_count())
# my_dtypes = OrderedDict([("time",int), ("cik",int)])
PREDICATION_COLS = [
    'PREDICATION_ID', 'SENTENCE_ID', 'PMID', 'PREDICATE', 
    'SUBJECT_CUI', 'SUBJECT_NAME', 'SUBJECT_SEMTYPE', 'SUBJECT_NOVELTY', 'OBJECT_CUI',
    'OBJECT_NAME', 'OBJECT_SEMTYPE', 'OBJECT_NOVELTY', 'Nan1', 'Nan2', 'Nan3']
TRIPLE_DATA = ['PREDICATE','SUBJECT_CUI','SUBJECT_NAME',
               'SUBJECT_SEMTYPE','OBJECT_CUI','OBJECT_NAME','OBJECT_SEMTYPE']
PREDICATION_AUX_COLS = ['PREDICATION_AUX_ID' ,'PREDICATION_ID' ,'SUBJECT_TEXT','SUBJECT_DIST'
,'SUBJECT_MAXDIST','SUBJECT_START_INDEX'
,'SUBJECT_END_INDEX','SUBJECT_SCORE'
,'INDICATOR_TYPE','PREDICATE_START_INDEX'
,'PREDICATE_END_INDEX','OBJECT_TEXT','OBJECT_DIST'
,'OBJECT_MAXDIST','OBJECT_START_INDEX'
,'OBJECT_END_INDEX','OBJECT_SCORE'
,'CURR_TIMESTAMP']

PREDICATION_DIR = 'PREDICATION'
PREDICATION_AUX_DIR = 'PREDICATION_AUX'
TRIPLES_DIR = 'all_triples'

In [2]:
collect_garbage()

(87, 8, 7)
(2, 0, 0)


In [3]:
## solve the cui canonization problem
## create link between canonized triples and original triples
## create link between original triples and predications


In [4]:
def canonize_cui(cui):
    cacon_form = 'C0000000'
    if len(cui)<8:
        cui = cacon_form[0:-len(cui)]+cui
    elif 'C' not in  cui:
        cui = 'C' + cui
    return cui

def process_mult_cui(cui):
    origin_cui = None
    all_cuis = []
    for cu in cui.split('|'):
        if origin_cui is None:
            cu = canonize_cui(cu)
            origin_cui = cu
        else:
#             cu = origin_cui[0:len(origin_cui)-len(cu)] + cu
            cu = origin_cui[0:-len(cu)] + cu
        all_cuis.append(cu)
    return all_cuis

def process_cui(cui):
    if '|' in cui:
        cuis = process_mult_cui(cui)
    else:
        cuis = [canonize_cui(cui)]
    return cuis

In [5]:
def get_all_triples():
    all_files = sorted(glob(join(PREDICATION_DIR,'*.gz')))
    triple_df = None
    entity_df = None
    for f_name in tqdm(all_files):
        print('--------')
        df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)[TRIPLE_DATA]
        print('origin df .shape: ',df.shape)
        sub_data = df[['SUBJECT_CUI','SUBJECT_SEMTYPE', 'SUBJECT_NAME']].rename(columns ={'SUBJECT_CUI':'ENTITY','SUBJECT_SEMTYPE':'SEMTYPE', 'SUBJECT_NAME':'NAME'})
        obj_data = df[['OBJECT_CUI','OBJECT_SEMTYPE', 'OBJECT_NAME']].rename(columns ={'OBJECT_CUI':'ENTITY','OBJECT_SEMTYPE': 'SEMTYPE', 'OBJECT_NAME':'NAME'})
#         print(sub_data)
        
#         print('obj_data .shape: ',obj_data.shape)
#         print('sub_data .shape: ',sub_data.shape)
        ent_data = pd.concat([sub_data, obj_data], axis=0, ignore_index=True)
#         print(ent_data)
        ent_data.drop_duplicates(inplace=True, ignore_index=True)
        df.drop(['SUBJECT_NAME','SUBJECT_SEMTYPE','OBJECT_NAME','OBJECT_SEMTYPE'], inplace=True, axis=1)
        df.drop_duplicates(inplace=True, ignore_index=True)
#         print('ent_data.shape: ',ent_data.shape)
#         print('dropped df .shape: ',df.shape)
        
        if entity_df is None:
            entity_df = ent_data
        else:
            entity_df = pd.concat([entity_df, ent_data], axis=0, ignore_index=True)
            entity_df.drop_duplicates(inplace=True, ignore_index=True)
#         if len(list(entity_df.columns))>2:
#             print(ent_data)
        print('entity_df.shape: ',entity_df.shape)
        if triple_df is None:
            triple_df = df
        else:
            triple_df = pd.concat([triple_df, df], axis=0, ignore_index=True)
            triple_df.drop_duplicates(inplace=True, ignore_index=True)
        print('triple_df.shape: ',triple_df.shape)
        

    return triple_df, entity_df

def get_processed_triples_entities():
    pass

In [44]:
all_triples = get_all_triples()

  0%|          | 0/30 [00:00<?, ?it/s]

--------
origin df .shape:  (3961440, 7)
entity_df.shape:  (146933, 3)
triple_df.shape:  (1821307, 3)
--------
origin df .shape:  (3960366, 7)
entity_df.shape:  (186045, 3)
triple_df.shape:  (3183989, 3)
--------
origin df .shape:  (4017120, 7)
entity_df.shape:  (219440, 3)
triple_df.shape:  (4469396, 3)
--------
origin df .shape:  (3980676, 7)
entity_df.shape:  (240660, 3)
triple_df.shape:  (5600807, 3)
--------
origin df .shape:  (4025150, 7)
entity_df.shape:  (257469, 3)
triple_df.shape:  (6606451, 3)
--------
origin df .shape:  (4003037, 7)
entity_df.shape:  (268861, 3)
triple_df.shape:  (7495245, 3)
--------
origin df .shape:  (4006140, 7)
entity_df.shape:  (280323, 3)
triple_df.shape:  (8355630, 3)
--------


  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)[TRIPLE_DATA]


origin df .shape:  (3960045, 7)
entity_df.shape:  (291283, 3)
triple_df.shape:  (9201461, 3)
--------
origin df .shape:  (3936238, 7)
entity_df.shape:  (302069, 3)
triple_df.shape:  (10043151, 3)
--------
origin df .shape:  (3924723, 7)
entity_df.shape:  (312393, 3)
triple_df.shape:  (10877144, 3)
--------
origin df .shape:  (3904348, 7)
entity_df.shape:  (322820, 3)
triple_df.shape:  (11717442, 3)
--------
origin df .shape:  (3887322, 7)
entity_df.shape:  (331832, 3)
triple_df.shape:  (12560040, 3)
--------


  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)[TRIPLE_DATA]


origin df .shape:  (3884106, 7)
entity_df.shape:  (339499, 3)
triple_df.shape:  (13364359, 3)
--------
origin df .shape:  (3893402, 7)
entity_df.shape:  (346304, 3)
triple_df.shape:  (14160112, 3)
--------


  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)[TRIPLE_DATA]


origin df .shape:  (3893750, 7)
entity_df.shape:  (352665, 3)
triple_df.shape:  (14932245, 3)
--------
origin df .shape:  (3899488, 7)
entity_df.shape:  (359110, 3)
triple_df.shape:  (15700365, 3)
--------
origin df .shape:  (3898411, 7)
entity_df.shape:  (364658, 3)
triple_df.shape:  (16437067, 3)
--------
origin df .shape:  (3895593, 7)
entity_df.shape:  (369177, 3)
triple_df.shape:  (17106724, 3)
--------
origin df .shape:  (3884880, 7)
entity_df.shape:  (371803, 3)
triple_df.shape:  (17585202, 3)
--------


  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)[TRIPLE_DATA]


origin df .shape:  (3885454, 7)
entity_df.shape:  (374113, 3)
triple_df.shape:  (18046682, 3)
--------


  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)[TRIPLE_DATA]


origin df .shape:  (3880862, 7)
entity_df.shape:  (376224, 3)
triple_df.shape:  (18503326, 3)
--------
origin df .shape:  (3886032, 7)
entity_df.shape:  (378343, 3)
triple_df.shape:  (18937020, 3)
--------
origin df .shape:  (3879792, 7)
entity_df.shape:  (380170, 3)
triple_df.shape:  (19363363, 3)
--------
origin df .shape:  (3882586, 7)
entity_df.shape:  (381928, 3)
triple_df.shape:  (19776923, 3)
--------
origin df .shape:  (3882479, 7)
entity_df.shape:  (383463, 3)
triple_df.shape:  (20142419, 3)
--------


  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)[TRIPLE_DATA]


origin df .shape:  (3906598, 7)
entity_df.shape:  (387108, 3)
triple_df.shape:  (20721429, 3)
--------
origin df .shape:  (3840274, 7)
entity_df.shape:  (450094, 3)
triple_df.shape:  (21628834, 3)
--------
origin df .shape:  (3815169, 7)
entity_df.shape:  (478487, 3)
triple_df.shape:  (22534205, 3)
--------
origin df .shape:  (3817231, 7)
entity_df.shape:  (498328, 3)
triple_df.shape:  (23423145, 3)
--------
origin df .shape:  (2033207, 7)
entity_df.shape:  (507384, 3)
triple_df.shape:  (23857437, 3)


In [45]:
all_triples[0]

Unnamed: 0,PREDICATE,SUBJECT_CUI,OBJECT_CUI
0,PROCESS_OF,C0003725,C0999630
1,ISA,C0039258,C0446169
2,ISA,C0318627,C0206590
3,ISA,C0446169,C0003725
4,PROCESS_OF,C0012634,C0020114
...,...,...,...
23857432,PROCESS_OF,C0599155,C1416777|3909
23857433,PROCESS_OF,C1519323,C1334147|3691
23857434,COEXISTS_WITH,C1416777|3909,C1416783|3914
23857435,PROCESS_OF,C4023614,C1416777|3909


In [50]:
pd.read_csv(join(TRIPLES_DIR, 'all_data_triples.csv'), compression = 'gzip')

Unnamed: 0,PREDICATE,SUBJECT_CUI,OBJECT_CUI
0,PROCESS_OF,C0003725,C0999630
1,ISA,C0039258,C0446169
2,ISA,C0318627,C0206590
3,ISA,C0446169,C0003725
4,PROCESS_OF,C0012634,C0020114
...,...,...,...
23857432,PROCESS_OF,C0599155,C1416777|3909
23857433,PROCESS_OF,C1519323,C1334147|3691
23857434,COEXISTS_WITH,C1416777|3909,C1416783|3914
23857435,PROCESS_OF,C4023614,C1416777|3909


In [47]:
all_triples[0].to_csv(join(TRIPLES_DIR, 'all_data_triples.csv'), compression = 'gzip', index = False)

In [48]:
all_triples[1].to_csv(join(TRIPLES_DIR, 'all_data_entities.csv'), compression = 'gzip', index = False)

In [46]:
all_triples[1]

Unnamed: 0,ENTITY,SEMTYPE,NAME
0,C0003725,virs,Arboviruses
1,C0039258,virs,Tahyna virus
2,C0318627,virs,Eyach virus
3,C0446169,virs,California Group Viruses
4,C0012634,dsyn,Disease
...,...,...,...
507379,C0597918,popg,Filipino Americans
507380,C0022038,irda,ioxaglate
507381,C1289860,mnob,Flashlamp-pumped pulsed dye laser device
507382,C1866020,fndg,Centralized myonuclei


## Triple _ Predication map

In [6]:
def get_trip_2_id(trip_df):
    PREDICATE = list(trip_df['PREDICATE'])
    SUBJECT_CUI = list(trip_df['SUBJECT_CUI'])
    OBJECT_CUI = list(trip_df['OBJECT_CUI'])
    index = list(trip_df.index)
    return {(p, s, o):i for p, s, o, i in zip(PREDICATE, SUBJECT_CUI, OBJECT_CUI, index)}

In [7]:
def get_triple_pred_map(trip_df):
    trip_2_id = get_trip_2_id(trip_df)
    all_files = sorted(glob(join(PREDICATION_DIR,'*.gz')))
    all_dfs = []
    
    for f_name in tqdm(all_files):
#         print('--------')
#         df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS, engine='pyarrow')
        df = pd.read_csv(
            f_name, compression= 'gzip', on_bad_lines=test_line, encoding='ISO-8859-1',
            header=None, names = PREDICATION_COLS, engine='python'
        )
        df = df.groupby(['PREDICATE','SUBJECT_CUI','OBJECT_CUI'])[['PREDICATION_ID', 'SENTENCE_ID']].agg(lambda x: list(x))
        df["ORIGIN_ID"] = [trip_2_id[a] for a in list(df.index)]
        df = df[['ORIGIN_ID', 'PREDICATION_ID', 'SENTENCE_ID']]
        all_dfs.append(df)
    all_dfs = pd.concat(all_dfs, ignore_index = True)
#     all_dfs[['ORIGIN_ID', 'SENTENCE_ID']].explode('SENTENCE_ID')
#     all_dfs[['ORIGIN_ID', 'PREDICATION_ID']].explode('PREDICATION_ID')
#     all_dfs = all_dfs.drop_duplicates(ignore_index = True)
    return all_dfs

def get_triple_pred_aux_map(trip_df):
    trip_2_id = get_trip_2_id(trip_df)
    all_files = sorted(glob(join(PREDICATION_AUX_DIR,'*.gz')))
    all_dfs = []
    
    for f_name in tqdm(all_files):
#         print('--------')
        df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_AUX_COLS, engine='pyarrow')
#         return df
        df = df.groupby(['PREDICATE','SUBJECT_CUI','OBJECT_CUI'])[['PREDICATION_AUX_ID', 'PREDICATION_ID']].agg(lambda x: list(x))
        df["ORIGIN_ID"] = [trip_2_id[a] for a in list(df.index)]
        df = df[['ORIGIN_ID', 'PREDICATION_ID', 'SENTENCE_ID']]
        all_dfs.append(df)
    all_dfs = pd.concat(all_dfs, ignore_index = True)
    return all_dfs
def test_line(bad_line):
    print(bad_line)
    return bad_line

def get_pred_aux_pred_map():
    all_files = sorted(glob(join(PREDICATION_AUX_DIR,'*.gz')))
    all_dfs = []
    
    for f_name in tqdm(all_files):
        df = pd.read_csv(
            f_name, compression= 'gzip', on_bad_lines=test_line, encoding='ISO-8859-1',
            header=None, names = PREDICATION_AUX_COLS, engine='python'
        )
#         df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_AUX_COLS, engine='pyarrow')
        all_dfs.append(df[['PREDICATION_AUX_ID', 'PREDICATION_ID']])
    all_dfs = pd.concat(all_dfs, ignore_index = True)
    return all_dfs

def get_triple_pred_map_df(res_dict):
    all_data = []
    for ORIGIN_ID in res_dict:
        data_ = {}
        data_['ORIGIN_ID'] = ORIGIN_ID
        for PREDICATION_ID in res_dict[ORIGIN_ID]:
            data_['PREDICATION_ID'] = PREDICATION_ID
            all_data.append(data_)
    return pd.DataFrame(all_data)
        

In [11]:
all_triples = pd.read_csv(join(TRIPLES_DIR, 'all_data_triples.csv'), compression = 'gzip')
all_entities = pd.read_csv(join(TRIPLES_DIR, 'all_data_entities.csv'), compression = 'gzip')

In [12]:
# xx = get_trip_2_id(all_triples)
del all_triples
del all_entities

In [30]:
pred_aux_pred_map = get_pred_aux_pred_map()

  0%|          | 0/810 [00:00<?, ?it/s]

In [39]:
pred_aux_pred_map.to_csv(join('index_maps', 'pred_aux_pred_map.csv'), index = False, compression = 'gzip')

In [36]:
pred_aux_pred_map[pred_aux_pred_map['PREDICATION_AUX_ID']== 92150804]

Unnamed: 0,PREDICATION_AUX_ID,PREDICATION_ID
18433956,92150804,92150816


In [37]:
#del xx
# id_2_pred_aux = get_triple_pred_aux_map(all_triples)

In [20]:
id_2_pred_aux

Unnamed: 0,PREDICATION_AUX_ID,PREDICATION_ID,SUBJECT_TEXT,SUBJECT_DIST,SUBJECT_MAXDIST,SUBJECT_START_INDEX,SUBJECT_END_INDEX,SUBJECT_SCORE,INDICATOR_TYPE,PREDICATE_START_INDEX,PREDICATE_END_INDEX,OBJECT_TEXT,OBJECT_DIST,OBJECT_MAXDIST,OBJECT_START_INDEX,OBJECT_END_INDEX,OBJECT_SCORE,CURR_TIMESTAMP
0,10592600,10592604,arboviruses,1,3,69,80,840,PREP,81,83,brown hares,1,3,93,104,884,2019-03-26 15:15:35
1,10592679,10592697,Tahyna virus,0,0,232,244,1000,SPEC,232,279,California encephalitis serogroup,0,0,246,279,901,2019-03-26 15:15:35
2,10592713,10592728,Eyach virus,0,0,196,207,1000,SPEC,196,225,genus Coltivirus,0,0,209,225,1000,2019-03-26 15:15:35
3,10592749,10592759,California encephalitis serogroup,0,0,246,279,901,SPEC,246,326,arthropod-borne viruses,0,0,303,326,1000,2019-03-26 15:15:35
4,10592816,10592832,disease,0,0,402,409,888,MOD/HEAD,396,409,human,0,0,396,401,888,2019-03-26 15:15:35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144635,28627549,28627577,adrenal system,1,3,71,85,901,PREP,86,88,patients,1,3,89,97,1000,2019-03-27 09:58:55
144636,28627714,28627733,Xanthurenic aciduria,2,2,21,41,1000,PREP,61,63,poisoning,1,1,80,89,861,2019-03-27 09:58:55
144637,28627839,28627864,vaginal,1,2,69,76,1000,PREP,66,68,complication,1,2,53,65,888,2019-03-27 09:58:55
144638,28627945,28627953,uterine cervix cancer,1,1,68,89,1000,PREP,63,67,patients,1,3,54,62,1000,2019-03-27 09:58:55


In [10]:
id_2_pred = get_triple_pred_map(all_triples)

  0%|          | 0/30 [00:00<?, ?it/s]

In [11]:
id_2_pred

Unnamed: 0,ORIGIN_ID,PREDICATION_ID,SENTENCE_ID
0,1775772,[76299252],[10378470]
1,1302567,[74851732],[24206336]
2,1737760,[76177799],[26954710]
3,380736,[72444077],[19247628]
4,29711,[15350637],[84375]
...,...,...,...
48206635,23529953,[195830370],[372609101]
48206636,23789666,[197163810],[375763418]
48206637,23425206,"[195239203, 195239206]","[371250407, 371250413]"
48206638,23841294,"[197412490, 197412498]","[376355376, 376355386]"


In [12]:
id_2_pred = id_2_pred.set_index(['ORIGIN_ID']).apply(pd.Series.explode).reset_index()

In [13]:
id_2_pred

Unnamed: 0,ORIGIN_ID,PREDICATION_ID,SENTENCE_ID
0,1775772,76299252,10378470
1,1302567,74851732,24206336
2,1737760,76177799,26954710
3,380736,72444077,19247628
4,29711,15350637,84375
...,...,...,...
115523810,23425206,195239203,371250407
115523811,23425206,195239206,371250413
115523812,23841294,197412490,376355376
115523813,23841294,197412498,376355386


In [14]:
id_2_pred.to_csv(join('index_maps','origin_predication.csv'), compression = 'gzip', index = False)

In [17]:
del id_2_pred

## Origin Pred AUX MAP

In [15]:
def get_pred_pred_aux_origin_sent_map():
    ori_pred = pd.read_csv(join('index_maps','origin_predication.csv'), compression = 'gzip')
    pred_aux_pred = pd.read_csv(join('index_maps', 'pred_aux_pred_map.csv'), compression = 'gzip')
    return ori_pred.merge(pred_aux_pred,how='inner', left_on='PREDICATION_ID', right_on='PREDICATION_ID')

#     pred_aux_pred = dict(zip(pred_aux_pred['PREDICATION_ID'], pred_aux_pred['PREDICATION_AUX_ID']))
#     ori_pred['PREDICATION_AUX_ID'] = ori_pred['PREDICATION_ID'].map(pred_aux_pred)
#     return ori_pred


In [16]:
all_map = get_pred_pred_aux_origin_sent_map()

In [17]:
all_map

Unnamed: 0,ORIGIN_ID,PREDICATION_ID,SENTENCE_ID,PREDICATION_AUX_ID
0,1775772,76299252,10378470,76299234
1,1302567,74851732,24206336,74851718
2,1737760,76177799,26954710,76177784
3,380736,72444077,19247628,72444062
4,29711,15350637,84375,15350618
...,...,...,...,...
115511865,23425206,195239203,371250407,195244650
115511866,23425206,195239206,371250413,195244653
115511867,23841294,197412490,376355376,197418519
115511868,23841294,197412498,376355386,197418527


In [20]:
all_map[all_map['PREDICATION_AUX_ID']==92150156]

Unnamed: 0,ORIGIN_ID,PREDICATION_ID,SENTENCE_ID,PREDICATION_AUX_ID
19876960,3377456,92150173,6914997,92150156


In [22]:
all_map.set_index('PREDICATION_AUX_ID').loc[92150156, :]

ORIGIN_ID          3377456
PREDICATION_ID    92150173
SENTENCE_ID        6914997
Name: 92150156, dtype: int64

In [19]:
all_map.to_csv(join('index_maps','origin_pred_pred_aux_sent_map.csv'), compression = 'gzip', index = False)

## Canonizing CUIs

In [47]:
def canonize_cui(cui):
    cacon_form = 'C0000000'
    if len(cui)<8:
        cui = cacon_form[0:-len(cui)]+cui
    elif 'C' not in  cui:
        cui = 'C' + cui
    return cui

def process_mult_cui(cui):
    origin_cui = None
    all_cuis = []
    for cu in cui.split('|'):
        if origin_cui is None:
            cu = canonize_cui(cu)
            origin_cui = cu
        else:
#             cu = origin_cui[0:len(origin_cui)-len(cu)] + cu
            if len(origin_cui)>=8:
                cu = origin_cui[0:-len(cu)] + cu
            else:
                cu = canonize_cui(cu)
                origin_cui = cu
        if len(origin_cui)>=8:
            all_cuis.append(cu)
    return all_cuis

def process_cui(cui):
    if '|' in cui:
        cuis = process_mult_cui(cui)
    else:
        cuis = [canonize_cui(cui)]
    return cuis

def cononize_entities(ent_df):
    all_data = []
    for i, rec in ent_df.iterrows():
        ent_data = {}
        ENTITY = rec['ENTITY']
        ent_data['SEMTYPE'] = rec['SEMTYPE']
        ent_data['NAME']  = rec['NAME']
        ent_data['ORIGIN_ID'] = i
        cuis = process_cui(ENTITY)
        for cui in cuis:
            ent_data['ENTITY'] = cui
            all_data.append(ent_data)
    return pd.DataFrame(all_data)

def canonize_triples(trip_df):
    all_data = []
    for i, rec in trip_df.iterrows():
        trip_data = {}
        trip_data['PREDICATE'] = rec['PREDICATE']
        trip_data['ORIGIN_ID'] = i
        
        SUBJECT_CUI = rec['SUBJECT_CUI']
        OBJECT_CUI = rec['OBJECT_CUI']
        
        sub_cuis = process_cui(SUBJECT_CUI)
        obj_cuis = process_cui(OBJECT_CUI)
        for sub_cui in sub_cuis:
            for obj_cui in obj_cuis:
                trip_data['SUBJECT_CUI'] = sub_cui
                trip_data['OBJECT_CUI'] = obj_cui
                all_data.append(trip_data)
    return pd.DataFrame(all_data)

In [6]:
all_triples = pd.read_csv(join(TRIPLES_DIR, 'all_data_triples.csv'), compression = 'gzip')
all_entities = pd.read_csv(join(TRIPLES_DIR, 'all_data_entities.csv'), compression = 'gzip')

In [53]:
can = cononize_entities(all_entities)

In [54]:
#all_entities.shape

In [55]:
#can

In [56]:
#all_entities[all_entities['NAME']=='1|podg|humn']

In [57]:
#all_entities[all_entities['NAME']=='1|podg|humn']

In [58]:
#can[can['ENTITY']=='Patients']

In [59]:
#all_entities[all_entities['ENTITY']=='Patients']

In [60]:
#{i for i in list(can['ENTITY'])if 'C' not in i and len(i)<=8}

In [61]:
#{i for i in list(can['ENTITY'])if i==''}

In [62]:
#process_mult_cui('10128')

In [63]:
#{i for i in list(all_entities['ENTITY'])if '10128' in i}

In [64]:
#{i for i in list(can['ENTITY'])if '|' in i}

In [65]:
#canonize_cui('10128')

In [66]:
can.to_csv(join(TRIPLES_DIR, 'all_data_entities_can.csv'), compression = 'gzip', index = False)

In [67]:
can = canonize_triples(all_triples)

In [68]:
can

Unnamed: 0,PREDICATE,ORIGIN_ID,SUBJECT_CUI,OBJECT_CUI
0,PROCESS_OF,0,C0003725,C0999630
1,ISA,1,C0039258,C0446169
2,ISA,2,C0318627,C0206590
3,ISA,3,C0446169,C0003725
4,PROCESS_OF,4,C0012634,C0020114
...,...,...,...,...
28720661,COEXISTS_WITH,23857434,C1413909,C1413914
28720662,PROCESS_OF,23857435,C4023614,C1413909
28720663,PROCESS_OF,23857435,C4023614,C1413909
28720664,PROCESS_OF,23857436,C4023614,C1413914


In [69]:
#process_mult_cui('10128')

In [70]:
{i for i in list(can['SUBJECT_CUI'])if 'C' not in i and len(i)<=8}

set()

In [72]:
{i for i in list(can['SUBJECT_CUI'])if '|' in i}

set()

In [73]:
can.to_csv(join(TRIPLES_DIR, 'all_data_triples_can.csv'), compression = 'gzip', index = False)

In [74]:
del can

## Tests

In [12]:

df = pd.read_csv('PREDICATION/split_07.csv.gz',  compression= 'gzip', sep = ',', engine="pyarrow",header=None, names = PREDICATION_COLS)
%time

ArrowInvalid: CSV parse error: Expected 15 columns, got 14: "102593865","76990453","9860854","MEASURES","C0006779","Calorimetry","lbpr","1","C0450304","0.01 ...

In [22]:
df = pd.read_csv('PREDICATION/split_26.csv.gz', compression= 'gzip', encoding='iso-8859-1', sep = ',',header=None, names = PREDICATION_COLS)
%time

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 11 µs


In [19]:
df

Unnamed: 0,PREDICATION_ID,SENTENCE_ID,PMID,PREDICATE,SUBJECT_CUI,SUBJECT_NAME,SUBJECT_SEMTYPE,SUBJECT_NOVELTY,OBJECT_CUI,OBJECT_NAME,OBJECT_SEMTYPE,OBJECT_NOVELTY,Nan1,Nan2,Nan3
0,182425281,342480916,31592797,DIAGNOSES,C0024671,Mammography,diap,1,C0006142,Malignant neoplasm of breast,neop,1,\N,\N,\N
1,182425282,342480916,31592797,ADMINISTERED_TO,C0040395,tomography,diap,1,C0043210,Woman,humn,1,\N,\N,\N
2,182425283,342480920,31592797,USES,C0040395,tomography,diap,1,C1441526,COMPUTED,lbpr,0,\N,\N,\N
3,182425284,342480921,31592797,ADMINISTERED_TO,C0041618,Ultrasonography,diap,1,C0030705,Patients,humn,0,\N,\N,\N
4,182425285,342480926,31592797,USES,C0040395,tomography,diap,1,C1441526,COMPUTED,lbpr,0,\N,\N,\N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3840269,186485441,351506576,32439845,LOCATION_OF,C2331062,Orbitofrontal cortex,blor,1,C4019080,Dorsolateral Prefrontal Cortex,bpoc,1,\N,\N,\N
3840270,186485442,351506576,32439845,PROCESS_OF,C0582591,Processing speed,menp,1,C0027361,Persons,humn,0,\N,\N,\N
3840271,186485443,351506581,32439846,TREATS,C3166216,Prescribed medications,phsu,1,C0525045,Mood Disorders,mobd,1,\N,\N,\N
3840272,186485444,351506586,32439846,INTERACTS_WITH,C0085828,Transcription Factor AP-1,aapp,1,C0003289,Antidepressive Agents,phsu,1,\N,\N,\N


In [6]:
pd.io.parquet.get_engine('auto')

<pandas.io.parquet.PyArrowImpl at 0x7f41d5a898e0>

In [3]:
pd.io.parquet.PyArrowImpl()

<pandas.io.parquet.PyArrowImpl at 0x7f41d5ac1970>

In [2]:
pd.show_versions()


INSTALLED VERSIONS
------------------
commit           : 66e3805b8cabe977f40c05259cc3fcf7ead5687d
python           : 3.8.12.final.0
python-bits      : 64
OS               : Linux
OS-release       : 5.13.0-51-generic
Version          : #58~20.04.1-Ubuntu SMP Tue Jun 14 11:29:12 UTC 2022
machine          : x86_64
processor        : x86_64
byteorder        : little
LC_ALL           : None
LANG             : en_US.UTF-8
LOCALE           : en_US.UTF-8

pandas           : 1.3.5
numpy            : 1.21.5
pytz             : 2021.3
dateutil         : 2.8.1
pip              : 21.2.4
setuptools       : 59.5.0
Cython           : 0.29.28
pytest           : None
hypothesis       : None
sphinx           : 4.4.0
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.8.0
html5lib         : None
pymysql          : 1.0.2
psycopg2         : 2.9.3 (dt dec pq3 ext lo64)
jinja2           : 3.0.3
IPython          : 7.31.1
pandas_datareader: None
bs4              : 4.11.1

In [4]:
df

Unnamed: 0,"10592604,""16"",""16530475"",""PROCESS_OF"",""C0003725"",""Arboviruses"",""virs"",""1"",""C0999630"",""Lepus capensis"",""mamm"",""1"",\N,\N,\N"
0,"10592697,""17"",""16530475"",""ISA"",""C0039258"",""Tah..."
1,"10592728,""17"",""16530475"",""ISA"",""C0318627"",""Eya..."
2,"10592759,""17"",""16530475"",""ISA"",""C0446169"",""Cal..."
3,"10592832,""18"",""16530475"",""PROCESS_OF"",""C001263..."
4,"10592873,""18"",""16530475"",""CAUSES"",""C0042776"",""..."
...,...
3961434,"76442302,""27482905"",""25318337"",""PROCESS_OF"",""C..."
3961435,"76442303,""27476761"",""24352628"",""LOCATION_OF"",""..."
3961436,"76442304,""27486552"",""24224652"",""PART_OF"",""C020..."
3961437,"76442305,""27487681"",""23545854"",""METHOD_OF"",""C1..."


In [6]:
sorted(glob(join(PREDICATION_DIR,'*.gz')))[6]

'PREDICATION/split_06.csv.gz'