In [4]:
import pandas as pd
from os.path import join
import json
import numpy as np

PROC_UMLS = '/home/pc/Desktop/AdilStuff/Projects/SemRepMed/umls processed net data'
SELECTED_TRIPLES = 'Selected Triples'
# COVID Complet  | risk factor covid kg
# SELECTED_KG = 'risk factor covid kg'
SELECTED_KG = 'risk_factor_disease_time'
TRIPLES_DIR = 'all_triples'

In [5]:
def save_json(path_, data_):
    with open(path_, 'w') as f:
        json.dump(data_, f)
        
def read_json(path_):
    with open(path_, 'r') as f:
        data = json.load(f)
    return data

In [11]:
def verify_splits(train_, test_, val_,
                  head_col = 'SUBJECT_CUI', rel_col = 'PREDICATE', tail_col = 'OBJECT_CUI',
                  proba_col = 'label_proba'):
    
    all_train_ents = set(list(train_[head_col]) + list(train_[tail_col]))
    all_train_rels = set(list(train_[rel_col]))
    total_triples = train_.shape[0]
    all_probas = list(train_[proba_col])
    if val_ is not None:
        all_val_ents = set(list(val_[head_col]) + list(val_[tail_col]))
        all_val_rels = set(list(val_[rel_col]))
        total_triples += val_.shape[0]
        all_probas += list(val_[proba_col])
    else:
        all_val_ents, all_val_rels = [], []
        
    if test_ is not None:
        all_test_ents = set(list(test_[head_col]) + list(test_[tail_col]))
        all_test_rels = set(list(test_[rel_col]))
        total_triples += test_.shape[0]
        all_probas += list(test_[proba_col])
    else:
        all_test_ents, all_test_rels = [], []
        
    print('total number of triples: ',total_triples)
    print('# all entities: ',len(set(list(all_val_ents)+list(all_test_rels)+list(all_train_ents))))
    print('# all relations: ',len(set(list(all_test_rels)+list(all_val_rels)+list(all_train_rels))))
    print('# Average proba: ', np.array(all_probas).mean())
    print('# Std proba: ', np.array(all_probas).std())
    
    print('number of training triples: ',train_.shape[0])
    if test_ is not None:
        print('number of test triples: ',test_.shape[0])
    if val_ is not None:
        print('number of val triples: ',val_.shape[0])
    print('fraction of training triples: ',train_.shape[0]/total_triples)
    if test_ is not None:
        print('fraction of test triples: ',test_.shape[0]/total_triples)
        
    if val_ is not None:
        print('fraction of val triples: ',val_.shape[0]/total_triples)
    
    if (train_ is not None) & (test_ is not None) & (val_ is not None):
        print('ents in val not train: ',len(all_val_ents.difference(all_train_ents)))
        print('ents in test not train: ',len(all_test_ents.difference(all_train_ents)))
        print('rels in val not train: ',len(all_val_rels.difference(all_train_rels)))
        print('rels in test not train: ',len(all_test_rels.difference(all_train_rels)))
    
def kg_split():
    all_df = pd.read_csv(join(SELECTED_TRIPLES, SELECTED_KG, 'full_kg.csv'), compression = 'gzip')
    train_df = all_df.groupby(['SUBJECT_CUI', 'OBJECT_CUI']).sample(frac = 0.56)
    test_val = all_df[~all_df.index.isin(train_df.index)]
    test_df = test_val.sample(frac = 0.5)
    val_df = test_val[~test_val.index.isin(test_df.index)]
    return train_df, test_df, val_df

def splits_formatting_save(train_, test_, val_, save_path):
    
    new_col_names = {'PREDICATE':'r','SUBJECT_CUI':'h','OBJECT_CUI':'t',
                     'label_proba':'p','SUBJECT_TYPE':'h_type','OBJECT_TYPE':'t_type'}
    train_.rename(new_col_names, axis = 1).to_csv(join(save_path, 'raw_train.csv'), index = False)
    test_.rename(new_col_names, axis = 1).to_csv(join(save_path, 'raw_test.csv'), index = False)
    val_.rename(new_col_names, axis = 1).to_csv(join(save_path, 'raw_val.csv'), index = False)
    all_train_ents = list(train_['SUBJECT_CUI']) + list(train_['OBJECT_CUI'])
    all_train_ents_types = list(train_['SUBJECT_TYPE']) + list(train_['OBJECT_TYPE'])
    all_train_rels = list(train_['PREDICATE'])
    
    all_val_ents = list(val_['SUBJECT_CUI']) + list(val_['OBJECT_CUI'])
    all_val_ents_types = list(val_['SUBJECT_TYPE']) + list(val_['OBJECT_TYPE'])
    all_val_rels = list(val_['PREDICATE'])
    
    all_test_ents = list(test_['SUBJECT_CUI']) + list(test_['OBJECT_CUI'])
    all_test_ents_types = list(test_['SUBJECT_TYPE']) + list(test_['OBJECT_TYPE'])
    all_test_rels = list(test_['PREDICATE'])
    
    all_ents = list(set(all_train_ents + all_val_ents + all_test_ents))
    all_rels = list(set(all_train_rels + all_val_rels + all_test_rels))
    all_ent_types = list(set(all_train_ents_types + all_val_ents_types + all_test_ents_types))
    ent_map = {e:idx for idx, e in enumerate(all_ents)}
    rel_map = {e:idx for idx, e in enumerate(all_rels)}
    ent_type_map = {e:idx for idx, e in enumerate(all_ent_types)}
    #####
    train_['SUBJECT_CUI'] = train_['SUBJECT_CUI'].map(ent_map)
    train_['OBJECT_CUI'] = train_['OBJECT_CUI'].map(ent_map)
    train_['SUBJECT_TYPE'] = train_['SUBJECT_TYPE'].map(ent_type_map)
    train_['OBJECT_TYPE'] = train_['OBJECT_TYPE'].map(ent_type_map)
    train_['PREDICATE'] = train_['PREDICATE'].map(rel_map)
    #
    test_['SUBJECT_CUI'] = test_['SUBJECT_CUI'].map(ent_map)
    test_['OBJECT_CUI'] = test_['OBJECT_CUI'].map(ent_map)
    test_['SUBJECT_TYPE'] = test_['SUBJECT_TYPE'].map(ent_type_map)
    test_['OBJECT_TYPE'] = test_['OBJECT_TYPE'].map(ent_type_map)
    test_['PREDICATE'] = test_['PREDICATE'].map(rel_map)
    #
    val_['SUBJECT_CUI'] = val_['SUBJECT_CUI'].map(ent_map)
    val_['OBJECT_CUI'] = val_['OBJECT_CUI'].map(ent_map)
    val_['SUBJECT_TYPE'] = val_['SUBJECT_TYPE'].map(ent_type_map)
    val_['OBJECT_TYPE'] = val_['OBJECT_TYPE'].map(ent_type_map)
    val_['PREDICATE'] = val_['PREDICATE'].map(rel_map)
    ####
    train_ = train_.rename(new_col_names, axis = 1)
    test_ = test_.rename(new_col_names, axis = 1)
    val_ = val_.rename(new_col_names, axis = 1)
    ## saving
    train_.to_csv(join(save_path, 'train.csv'), index = False)
    test_.to_csv(join(save_path, 'test.csv'), index = False)
    val_.to_csv(join(save_path, 'val.csv'), index = False)
    save_json(join(save_path, 'ent_map.json'), ent_map)
    save_json(join(save_path, 'rel_map.json'), rel_map)
    save_json(join(save_path, 'ent_type_map.json'), ent_type_map)
    return train_, test_, val_, ent_map, rel_map, ent_type_map

def test_read(save_path):
    train = pd.read_csv(join(save_path, 'train.csv'))
    test = pd.read_csv(join(save_path, 'test.csv'))
    val = pd.read_csv(join(save_path, 'val.csv'))
    raw_train = pd.read_csv(join(save_path, 'raw_train.csv'))
    raw_test = pd.read_csv(join(save_path, 'raw_test.csv'))
    raw_val = pd.read_csv(join(save_path, 'raw_val.csv'))
    ent_map = read_json(join(save_path, 'ent_map.json'))
    rel_map = read_json(join(save_path, 'rel_map.json'))
    ent_type_map = read_json(join(save_path, 'ent_type_map.json'))
    return raw_train, raw_test, raw_val, train, test, val, ent_map, rel_map, ent_type_map

In [16]:
train_df, test_df, val_df = kg_split()

In [17]:
verify_splits(train_df, test_df, val_df)

total number of triples:  280812
# all entities:  28175
# all relations:  61
# Average proba:  0.9011285565285632
# Std proba:  0.2386312469627831
number of training triples:  227311
number of test triples:  26750
number of val triples:  26751
fraction of training triples:  0.8094775152059028
fraction of test triples:  0.09525946184635985
fraction of val triples:  0.09526302294773728
ents in val not train:  0
ents in test not train:  0
rels in val not train:  0
rels in test not train:  0


In [19]:
train_df

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,time_min,time_max,SUBJECT_TYPE,OBJECT_TYPE
50293,990002,DISRUPTS,100125288,C0240173,1.0,2015-08-22 00:00:00,2015-08-22 00:00:00,aapp,fndg
50291,990002,DISRUPTS,100125288,C0240173,1.0,2015-08-22 00:00:00,2015-08-22 00:00:00,aapp,fndg
38289,672125,AFFECTS,100129339,C0237849,1.0,2015-08-21 00:00:00,2015-08-21 00:00:00,aapp,fndg
108375,3434195,ISA,100187907,C0033684,1.0,1996-06-01 00:00:00,1996-06-01 00:00:00,aapp,aapp
79308,1911236,CAUSES,100187907,C0231552,1.0,2016-09-28 00:00:00,2016-09-28 00:00:00,aapp,fndg
...,...,...,...,...,...,...,...,...,...
241536,6480149,compared_with,C1620098,C1620098,1.0,1986-08-01 00:00:00,1986-08-01 00:00:00,aapp,aapp
122035,3787703,ISA,C1620104,C0133195,1.0,1995-01-01 00:00:00,1995-11-01 00:00:00,aapp,aapp
239204,6457418,compared_with,C1621245,C0021463,0.5,1989-10-01 00:00:00,1989-10-01 00:00:00,aapp,aapp
485,4219,AFFECTS,C1653361,C0020538,1.0,2006-03-15 00:00:00,2006-03-15 00:00:00,aapp,dsyn


In [20]:
save_path = join(SELECTED_TRIPLES, SELECTED_KG)


In [21]:
pd.read_csv(join(save_path, 'raw_train.csv'))

Unnamed: 0,ORIGIN_ID,r,h,t,p,time_min,time_max,h_type,t_type
0,990002,DISRUPTS,100125288,C0240173,1.0,2015-08-22 00:00:00,2015-08-22 00:00:00,aapp,fndg
1,990002,DISRUPTS,100125288,C0240173,1.0,2015-08-22 00:00:00,2015-08-22 00:00:00,aapp,fndg
2,990002,DISRUPTS,100125288,C0240173,1.0,2015-08-22 00:00:00,2015-08-22 00:00:00,aapp,fndg
3,672125,AFFECTS,100129339,C0237849,1.0,2015-08-21 00:00:00,2015-08-21 00:00:00,aapp,fndg
4,3434195,ISA,100187907,C0033684,1.0,1996-06-01 00:00:00,1996-06-01 00:00:00,aapp,aapp
...,...,...,...,...,...,...,...,...,...
235535,6480149,compared_with,C1620098,C1620098,1.0,1986-08-01 00:00:00,1986-08-01 00:00:00,aapp,aapp
235536,3787703,ISA,C1620104,C0133195,1.0,1995-01-01 00:00:00,1995-11-01 00:00:00,aapp,aapp
235537,6457418,compared_with,C1621245,C0021463,0.5,1989-10-01 00:00:00,1989-10-01 00:00:00,aapp,aapp
235538,4219,AFFECTS,C1653361,C0020538,1.0,2006-03-15 00:00:00,2006-03-15 00:00:00,aapp,dsyn


In [22]:
splits_formatting_save(train_ = train_df.copy(), test_ = test_df.copy(), val_ = val_df.copy(), save_path = save_path)

(        ORIGIN_ID   r      h      t    p             time_min  \
 50293      990002  18  16158  24362  1.0  2015-08-22 00:00:00   
 50291      990002  18  16158  24362  1.0  2015-08-22 00:00:00   
 38289      672125  24  10724  13513  1.0  2015-08-21 00:00:00   
 108375    3434195  36  11060  13729  1.0  1996-06-01 00:00:00   
 79308     1911236  50  11060  10474  1.0  2016-09-28 00:00:00   
 ...           ...  ..    ...    ...  ...                  ...   
 241536    6480149  39  13007  13007  1.0  1986-08-01 00:00:00   
 122035    3787703  36   5360  21347  1.0  1995-01-01 00:00:00   
 239204    6457418  39  12329  12410  0.5  1989-10-01 00:00:00   
 485          4219  24  25312   3754  1.0  2006-03-15 00:00:00   
 255586    6623675  16    486  13686  1.0  1985-04-01 00:00:00   
 
                    time_max  h_type  t_type  
 50293   2015-08-22 00:00:00       8      16  
 50291   2015-08-22 00:00:00       8      16  
 38289   2015-08-21 00:00:00       8      16  
 108375  1996-06-0

In [23]:
save_path = join(SELECTED_TRIPLES, SELECTED_KG)
raw_train, raw_test, raw_val, train, test, val, ent_map, rel_map, ent_type_map = test_read(save_path)

In [24]:
train

Unnamed: 0,ORIGIN_ID,r,h,t,p,time_min,time_max,h_type,t_type
0,990002,18,16158,24362,1.0,2015-08-22 00:00:00,2015-08-22 00:00:00,8,16
1,990002,18,16158,24362,1.0,2015-08-22 00:00:00,2015-08-22 00:00:00,8,16
2,672125,24,10724,13513,1.0,2015-08-21 00:00:00,2015-08-21 00:00:00,8,16
3,3434195,36,11060,13729,1.0,1996-06-01 00:00:00,1996-06-01 00:00:00,8,8
4,1911236,50,11060,10474,1.0,2016-09-28 00:00:00,2016-09-28 00:00:00,8,16
...,...,...,...,...,...,...,...,...,...
227306,6480149,39,13007,13007,1.0,1986-08-01 00:00:00,1986-08-01 00:00:00,8,8
227307,3787703,36,5360,21347,1.0,1995-01-01 00:00:00,1995-11-01 00:00:00,8,8
227308,6457418,39,12329,12410,0.5,1989-10-01 00:00:00,1989-10-01 00:00:00,8,8
227309,4219,24,25312,3754,1.0,2006-03-15 00:00:00,2006-03-15 00:00:00,8,5


In [None]:
len(rel_map)

In [None]:
len(ent_type_map)

In [None]:
len(ent_map)

In [None]:
rel_map

## Get Stats

In [32]:
save_path = join(SELECTED_TRIPLES, SELECTED_KG)
raw_train, raw_test, raw_val, train, test, val, ent_map, rel_map, ent_type_map = test_read(save_path)

In [34]:
pd.read_csv(join(save_path, 'raw_train.csv'))

Unnamed: 0,ORIGIN_ID,r,h,t,p,h_type,t_type
0,22599141,LOCATION_OF,100037417,C0017262,1.0,aapp,genf
1,12378912,PRODUCES,100125288,C0000820,1.0,aapp,aapp
2,12378912,PRODUCES,100125288,C0000820,1.0,aapp,aapp
3,12378912,PRODUCES,100125288,C0000820,1.0,aapp,aapp
4,15221147,ASSOCIATED_WITH,100125288,C0002395,1.0,aapp,dsyn
...,...,...,...,...,...,...,...
2997450,23778360,compared_with,C5549863,C0021747,0.0,topp,aapp
2997451,23778359,TREATS,C5549863,C0524910,1.0,topp,dsyn
2997452,23805050,TREATS,C5549863,C1444092,1.0,topp,virs
2997453,23805051,TREATS,C5549863,C1623038,1.0,topp,dsyn


In [35]:
verify_splits(raw_train, raw_test, raw_val,
                  head_col = 'h', rel_col = 'r', tail_col = 't',
                  proba_col = 'p')

total number of triples:  3843218
# all entities:  79651
# all relations:  62
# Average proba:  0.9036127702849742
# Std proba:  0.26119983467200864
number of training triples:  2997455
number of test triples:  422882
number of val triples:  422881
fraction of training triples:  0.7799336389452797
fraction of test triples:  0.11003331062666755
fraction of val triples:  0.11003305042805274
ents in val not train:  0
ents in test not train:  0
rels in val not train:  0
rels in test not train:  0


## Total KG

In [44]:
proba_triples_df = pd.read_csv(join(TRIPLES_DIR, 'triples_probabilities.csv'), compression = 'gzip')

In [48]:
proba_triples_df.head()

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba
0,0,PROCESS_OF,C0003725,C0999630,1.0
1,1,ISA,C0039258,C0446169,1.0
2,2,ISA,C0318627,C0206590,1.0
3,3,ISA,C0446169,C0003725,1.0
4,4,PROCESS_OF,C0012634,C0020114,0.989018


In [49]:
verify_splits(proba_triples_df, None, None,
                  head_col = 'SUBJECT_CUI', rel_col = 'PREDICATE', tail_col = 'OBJECT_CUI',
                  proba_col = 'label_proba')

total number of triples:  28416917
# all entities:  344991
# all relations:  68
# Average proba:  0.9131144540427637
# Std proba:  0.25031801450326685
number of training triples:  28416917
fraction of training triples:  1.0


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

## Triples Exploration

### COVID19 Risk Factors

In [None]:
SELECTED_KG = 'risk factor covid kg'
save_path = join(SELECTED_TRIPLES, SELECTED_KG)
train, test, val, ent_map, rel_map, ent_type_map = test_read(save_path)

In [None]:
verify_splits(train, test, val, 'h', 'r', 't', 'p')

### COVID19 Time Risk Factors

In [12]:
SELECTED_KG = 'risk_factor_disease_time'
save_path = join(SELECTED_TRIPLES, SELECTED_KG)
raw_train, raw_test, raw_val, train, test, val, ent_map, rel_map, ent_type_map = test_read(save_path)

In [13]:
verify_splits(train, test, val, 'h', 'r', 't', 'p')

total number of triples:  280812
# all entities:  28115
# all relations:  61
# Average proba:  0.901128556528563
# Std proba:  0.23863124696278307
number of training triples:  227311
number of test triples:  26750
number of val triples:  26751
fraction of training triples:  0.8094775152059028
fraction of test triples:  0.09525946184635985
fraction of val triples:  0.09526302294773728
ents in val not train:  0
ents in test not train:  0
rels in val not train:  0
rels in test not train:  0


## Relational Patterns

In [None]:
save_path = join(SELECTED_TRIPLES, SELECTED_KG)
rel_map = read_json(join(save_path, 'rel_map.json'))

In [None]:
rel_map = {k.lower():v for k, v in rel_map.items()}

In [None]:
def get_relational_patterns(rel_map):
    tran_comp = read_json(join(PROC_UMLS, 'umls_biokg_rel_patterns.json'))
    # transitive relations
    transitive_rels = list(map(rel_map.get, tran_comp['transitive_rels']))
    transitive_rels = [i for i in transitive_rels if i is not None]
    # composition relations
    composition_rels = [list(map(rel_map.get, i)) for i in tran_comp['composition_rels']]
    composition_rels = [i for i in composition_rels if (None not in i) and (len(set(i))==3)]
    save_path = join(SELECTED_TRIPLES, SELECTED_KG, 'relational_patterns.json')
    save_json(save_path, {'composition_rels':composition_rels, 'transitive_rels':transitive_rels})
    return transitive_rels, composition_rels


In [None]:
get_relational_patterns(rel_map)

In [None]:
get_relational_patterns(rel_map)