Objectives:
- Creating an uncertain temporal knowledge graph

In [19]:
import pandas as pd
from os.path import join
import json
# Root directory for the exported sub-graph files (used with join() when saving/loading the selected KG).
SELECTED_TRIPLES = 'Selected Triples'
# Directory holding the triple, probability and time files this notebook reads and writes.
TRIPLES_DIR = 'all_triples'

## Utils

In [20]:
def write_json_lines(file_name,dict_data):
    """Append ``dict_data`` to ``file_name`` as a single JSON-encoded line.

    Args:
        file_name: path of the JSON Lines file; it is opened in append mode.
        dict_data: JSON-serialisable object to store on its own line.
    """
    # Encode first, so the file is only opened once the payload is known to serialise.
    encoded_line = "{}\n".format(json.dumps(dict_data))
    with open(file_name, 'a') as sink:
        sink.write(encoded_line)
        
def read_json_lines(file_name):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        file_name: path of a text file holding one JSON document per line.

    Returns:
        list: the decoded value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(file_name) as file_in:
        # Blank or whitespace-only lines (e.g. a stray trailing newline) carry no
        # record; skipping them avoids a JSONDecodeError on otherwise valid files.
        return [json.loads(line) for line in file_in if line.strip()]

## Temporal UKGE Construction

In [3]:
def get_biotukg():
    """Load the probability-scored triples and attach each triple's time span.

    Reads the gzip-compressed ``triples_probabilities.csv`` and
    ``origin_time.jsonl`` from ``TRIPLES_DIR``, then adds ``time_min`` and
    ``time_max`` columns looked up through ``ORIGIN_ID``.

    Returns:
        pandas.DataFrame: the triples frame with the two time columns added.
    """
    triples = pd.read_csv(join(TRIPLES_DIR, 'triples_probabilities.csv'), compression = 'gzip')
    origin_times = pd.DataFrame(read_json_lines(join(TRIPLES_DIR, 'origin_time.jsonl')))
    origin_ids = origin_times['ORIGIN_ID']
    for time_col in ('time_min', 'time_max'):
        # ORIGIN_ID -> time value; ids absent from the lookup map to NaN.
        lookup = dict(zip(origin_ids, origin_times[time_col]))
        triples[time_col] = triples['ORIGIN_ID'].map(lookup)
    return triples

In [4]:
# Build the temporal uncertain KG: probability-scored triples plus their time_min / time_max.
tukg = get_biotukg()

In [5]:
# Inspect the assembled frame, including the newly attached time columns.
tukg

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,time_min,time_max
0,0,PROCESS_OF,C0003725,C0999630,1.000000,2006-03-15 00:00:00,2006-03-15 00:00:00
1,1,ISA,C0039258,C0446169,1.000000,1991-05-01 00:00:00,2006-03-15 00:00:00
2,2,ISA,C0318627,C0206590,1.000000,2006-03-15 00:00:00,2006-03-15 00:00:00
3,3,ISA,C0446169,C0003725,1.000000,1988-04-01 00:00:00,2006-03-15 00:00:00
4,4,PROCESS_OF,C0012634,C0020114,0.989018,1947-10-01 00:00:00,2017-07-01 00:00:00
...,...,...,...,...,...,...,...
28416912,23857434,COEXISTS_WITH,C1413909,C1413914,1.000000,,
28416913,23857435,PROCESS_OF,C4023614,C1413909,1.000000,,
28416914,23857435,PROCESS_OF,C4023614,C1413909,1.000000,,
28416915,23857436,PROCESS_OF,C4023614,C1413914,1.000000,,


In [6]:
# Preview the triples whose time_min and time_max both differ from the empty string.
# NOTE(review): a NaN value also compares unequal to '', so rows whose ORIGIN_ID had no
# entry in the time lookup would pass this filter too — confirm whether that can happen.
tukg[(tukg['time_min']!= '') & (tukg['time_max']!= '')]

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,time_min,time_max
0,0,PROCESS_OF,C0003725,C0999630,1.000000,2006-03-15 00:00:00,2006-03-15 00:00:00
1,1,ISA,C0039258,C0446169,1.000000,1991-05-01 00:00:00,2006-03-15 00:00:00
2,2,ISA,C0318627,C0206590,1.000000,2006-03-15 00:00:00,2006-03-15 00:00:00
3,3,ISA,C0446169,C0003725,1.000000,1988-04-01 00:00:00,2006-03-15 00:00:00
4,4,PROCESS_OF,C0012634,C0020114,0.989018,1947-10-01 00:00:00,2017-07-01 00:00:00
...,...,...,...,...,...,...,...
26252552,22159704,METHOD_OF,C0282493,C0005558,1.000000,1985-01-01 00:00:00,1985-01-01 00:00:00
26252603,22159974,PROCESS_OF,C0021359,C3903827,1.000000,1993-01-01 00:00:00,1993-01-01 00:00:00
26253743,22162665,LOCATION_OF,C1702260,C1515670,1.000000,2015-11-10 00:00:00,2015-11-10 00:00:00
26253744,22162665,LOCATION_OF,C1702260,C1515670,1.000000,2015-11-10 00:00:00,2015-11-10 00:00:00


In [7]:
# Keep only triples that carry a real time span and persist them for the next section.
# Series.map leaves NaN for ORIGIN_IDs without a time record, and NaN != '' evaluates to
# True, so missing values are excluded explicitly alongside the empty-string placeholder.
has_time_span = (
    tukg['time_min'].notna() & (tukg['time_min'] != '')
    & tukg['time_max'].notna() & (tukg['time_max'] != '')
)
tukg[has_time_span].to_csv(join(TRIPLES_DIR, 'triples_probabilities_time.csv'), compression = 'gzip', index = False)

## Get Sub Graph and Split

In [8]:
# Reload the time-filtered triples that the previous section wrote into TRIPLES_DIR.
tukg = pd.read_csv(join(TRIPLES_DIR, 'triples_probabilities_time.csv'), compression = 'gzip')
# Entity-type codes that both the subject and the object of a triple must belong to
# for the triple to be kept (passed to get_sub_schema_triples).
# NOTE(review): presumably UMLS semantic-type abbreviations — confirm.
ent_types = ['aapp', 'dsyn', 'virs', 'fndg', 'topp', 'hlca', 'eehu', 'ocac', 'aggp', 'dora', 'orga', 'socb', 'imft', 'hops', 'famg', 'genf', 'geoa']


In [12]:
def save_json(path_, data_):
    """Serialise ``data_`` as JSON into ``path_``, replacing any existing file.

    Args:
        path_: destination file path, opened in write mode.
        data_: JSON-serialisable object to store.
    """
    with open(path_, mode='w') as sink:
        json.dump(data_, fp=sink)
        
def read_json(path_):
    """Parse the JSON document stored at ``path_`` and return the decoded value.

    Args:
        path_: path of the JSON file to read.

    Returns:
        The Python object decoded from the file's JSON content.
    """
    with open(path_, mode='r') as source:
        return json.load(source)

def get_sub_schema_triples(str_un_mod_df, ent_types):
    """Keep only the triples whose subject and object both have an allowed type.

    Every CUI in ``SUBJECT_CUI`` / ``OBJECT_CUI`` is looked up in
    ``all_triples/all_entity_types.json`` and the result is stored in new
    ``SUBJECT_TYPE`` / ``OBJECT_TYPE`` columns. The annotation is done on a
    copy, so the frame passed in by the caller is left unchanged.

    Args:
        str_un_mod_df: DataFrame of triples with ``SUBJECT_CUI`` and
            ``OBJECT_CUI`` columns.
        ent_types: collection of entity-type values to keep.

    Returns:
        pandas.DataFrame: the rows whose subject type and object type are both
        in ``ent_types``, with the two type columns added.
    """
    # entity (CUI) to entity type
    ent_2_type_map = read_json(join('all_triples', 'all_entity_types.json'))
    # assign() returns a new frame; CUIs missing from the map get NaN and are
    # therefore rejected by the isin() checks below.
    typed_df = str_un_mod_df.assign(
        SUBJECT_TYPE=str_un_mod_df['SUBJECT_CUI'].map(ent_2_type_map),
        OBJECT_TYPE=str_un_mod_df['OBJECT_CUI'].map(ent_2_type_map),
    )
    both_allowed = typed_df['SUBJECT_TYPE'].isin(ent_types) & typed_df['OBJECT_TYPE'].isin(ent_types)
    return typed_df[both_allowed]


In [13]:
# Row/column count of the reloaded graph, before the entity-type filter is applied.
tukg.shape

(2323547, 7)

In [14]:
# Restrict the graph to triples whose subject and object types are both in ent_types.
# Note that this rebinds `tukg` to the filtered result.
tukg = get_sub_schema_triples(tukg, ent_types)

In [15]:
# Row/column count after the entity-type filter (two type columns were added).
tukg.shape

(280812, 9)

In [17]:
# Inspect the filtered triples, which now carry SUBJECT_TYPE / OBJECT_TYPE columns.
tukg

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,time_min,time_max,SUBJECT_TYPE,OBJECT_TYPE
1,1,ISA,C0039258,C0446169,1.000000,1991-05-01 00:00:00,2006-03-15 00:00:00,virs,virs
2,2,ISA,C0318627,C0206590,1.000000,2006-03-15 00:00:00,2006-03-15 00:00:00,virs,virs
5,5,CAUSES,C0042776,C0012634,0.924340,1966-01-01 00:00:00,2016-12-05 00:00:00,virs,dsyn
12,11,PRODUCES,C0007523,C0019878,1.000000,2006-03-15 00:00:00,2006-03-15 00:00:00,aapp,aapp
62,55,TREATS,C1533685,C0015780,1.000000,1985-09-01 00:00:00,2015-08-21 00:00:00,topp,orga
...,...,...,...,...,...,...,...,...,...
2323441,8372060,COEXISTS_WITH,C0024420,C0162745,1.000000,1974-05-01 00:00:00,1974-05-01 00:00:00,aapp,aapp
2323457,8373046,INTERACTS_WITH,C0242210,C0025736,1.000000,1973-03-01 00:00:00,1973-03-01 00:00:00,aapp,hops
2323484,15291076,METHOD_OF,C0002423,C1522577,1.000000,2016-09-14 00:00:00,2016-09-14 00:00:00,hlca,hlca
2323516,21032185,COEXISTS_WITH,C0184661,C0220825,0.957143,1989-01-01 00:00:00,1989-01-01 00:00:00,topp,hlca


In [21]:
# Persist the filtered graph as a gzip-compressed CSV, without the index column.
# NOTE(review): assumes the 'risk_factor_disease_time' directory already exists — confirm.
tukg.to_csv(join(SELECTED_TRIPLES, 'risk_factor_disease_time', 'full_kg.csv'), compression = 'gzip', index= False)

In [None]:
# Split the graph into train / test / validation frames.
# NOTE(review): kg_split is not defined in the visible part of this notebook and this cell
# has no execution count — confirm where it comes from before running on a fresh kernel.
train_df, test_df, val_df = kg_split()

In [None]:
# Name of the sub-graph directory inside SELECTED_TRIPLES.
SELECTED_KG = 'risk_factor_disease_time'
save_path = join(SELECTED_TRIPLES, SELECTED_KG)
# Read the saved splits and mappings back from save_path.
# NOTE(review): test_read is not defined in the visible part of this notebook — confirm
# its source and that it returns these six values in this order.
train, test, val, ent_map, rel_map, ent_type_map = test_read(save_path)