In [99]:
import pandas as pd
from glob import glob
from os.path import join
import networkx as nx
from tqdm.notebook import tqdm
import json

# Column names applied, in order, to the header-less PREDICATION CSV files
# (passed as `names=` to pd.read_csv). 'Nan1'..'Nan3' are placeholder names
# for the trailing columns -- presumably unused padding; confirm against the dump.
PREDICATION_COLS = [
    'PREDICATION_ID',
    'SENTENCE_ID',
    'PMID',
    'PREDICATE',
    'SUBJECT_CUI',
    'SUBJECT_NAME',
    'SUBJECT_SEMTYPE',
    'SUBJECT_NOVELTY',
    'OBJECT_CUI',
    'OBJECT_NAME',
    'OBJECT_SEMTYPE',
    'OBJECT_NOVELTY',
    'Nan1',
    'Nan2',
    'Nan3',
]

# Directory names, relative to the working directory, one per table split.
SENTENCE_DIR = 'SENTENCE'
PREDICATION_DIR = 'PREDICATION'
PREDICATION_AUX_DIR = 'PREDICATION_AUX'


In [100]:
def save_json(save_path, data):
    """Write ``data`` to ``save_path`` as JSON, replacing any existing file.

    Args:
        save_path: Destination file path.
        data: Any object ``json.dump`` can serialize.
    """
    with open(save_path, mode='w') as out_file:
        json.dump(data, out_file)
        
def read_json(save_path):
    """Load and return the JSON document stored at ``save_path``.

    Args:
        save_path: Path of an existing JSON file.

    Returns:
        The decoded Python object (dict, list, str, number, bool or None).
    """
    with open(save_path, mode='r') as in_file:
        return json.load(in_file)

In [101]:
def canonize_cui(cui):
    """Normalize one identifier to the 8-character ``C#######`` layout.

    * Identifiers shorter than 8 characters are left-padded with zeros behind
      a single leading ``C`` (``'123'`` -> ``'C0000123'``). A short identifier
      that already starts with ``C`` keeps that one prefix
      (``'C123'`` -> ``'C0000123'``) instead of getting a second one embedded
      in the middle (``'C000C123'``), which is what plain prefix-padding did.
    * Identifiers of 8+ characters containing no ``C`` get a ``C`` prepended.
    * Everything else, including the empty string, is returned unchanged.

    NOTE(review): bare numeric identifiers are padded into the CUI namespace.
    If the input column can also carry non-CUI identifiers (e.g. gene IDs),
    the result may collide with an unrelated real CUI -- confirm against the
    data source.

    Args:
        cui: Identifier string (a single value, no ``|`` separators).

    Returns:
        The normalized identifier string.
    """
    canon_form = 'C0000000'
    if not cui:
        # Nothing to pad; keep the historical pass-through for empty input
        # explicit rather than relying on the `[0:-0]` slice quirk.
        return cui
    if len(cui) < len(canon_form):
        # Strip an existing prefix first so it is not duplicated by the padding.
        digits = cui[1:] if cui.startswith('C') else cui
        return 'C' + digits.rjust(len(canon_form) - 1, '0')
    if 'C' not in cui:
        return 'C' + cui
    return cui

def process_mult_cui(cui):
    """Expand a ``|``-separated identifier string into a list of identifiers.

    The first segment is normalized with ``canonize_cui``. Every later segment
    is treated as a replacement for the *tail* of that first identifier: the
    leading characters of the first identifier are kept and the segment
    overwrites its last ``len(segment)`` characters
    (``'C0003725|26'`` -> ``['C0003725', 'C0003726']``).

    Args:
        cui: String containing one or more segments separated by ``|``.

    Returns:
        List of identifier strings, one per segment, in input order.
    """
    first_segment, *other_segments = cui.split('|')
    origin_cui = canonize_cui(first_segment)
    expanded = [origin_cui]
    expanded.extend(
        origin_cui[0:-len(segment)] + segment for segment in other_segments
    )
    return expanded

def process_cui(cui):
    """Return the list of normalized identifiers encoded in ``cui``.

    A value containing ``|`` is expanded by ``process_mult_cui``; any other
    value is normalized by ``canonize_cui`` and wrapped in a one-element list.
    """
    return process_mult_cui(cui) if '|' in cui else [canonize_cui(cui)]

## Example

In [9]:
df = pd.read_csv('Format_sents/form_sent_849503.csv',compression = 'gzip')

In [10]:
df

Unnamed: 0,PREDICATION_AUX_ID,SENTENCE,FORMATED_SENTENCE,file_name
0,167307222,BACKGROUND: Two and a half years after commenc...,BACKGROUND: Two and a half years after commenc...,SENTENCE/split_990577.csv.gz
1,167307223,BACKGROUND: Two and a half years after commenc...,BACKGROUND: Two and a half years after commenc...,SENTENCE/split_990577.csv.gz
2,167307224,This subsequent outbreak provided the opportun...,This subsequent outbreak provided the opportun...,SENTENCE/split_990577.csv.gz
3,167307225,Children with rotavirus-confirmed gastroenteri...,@PREDICAT$ @OBJECT$ rotavirus-confirmed @SUBJE...,SENTENCE/split_990577.csv.gz
4,167307226,Nineteen (46%) of 41 case patients had receive...,Nineteen (46%) of 41 case patients had receive...,SENTENCE/split_990577.csv.gz
...,...,...,...,...
96,167307319,"However, the subtype PsA was more prevalent in...","However, the subtype PsA was more prevalent in...",SENTENCE/split_990577.csv.gz
97,167307320,CONCLUSION: In Sweden the prevalence of spondy...,CONCLUSION: In Sweden the prevalence of @SUBJE...,SENTENCE/split_990577.csv.gz
98,167307321,PsA was the most frequent subtype followed by ...,@SUBJECT$ was the most frequent subtype @OBJEC...,SENTENCE/split_990577.csv.gz
99,167307322,Magnetic resonance imaging of skeletal muscles...,@PREDICAT$ @OBJECT$ @SUBJECT$ in sporadic incl...,SENTENCE/split_990577.csv.gz


In [None]:
def vis_formating():
    """Placeholder for a helper that visualises sentence formatting.

    The cell originally held only the ``def`` line, which is a SyntaxError
    when the notebook is run top to bottom. Until the helper is written it
    fails loudly instead of breaking the whole run.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError('vis_formating has no implementation yet')

In [21]:
list(df.iloc[0,1:3])

['BACKGROUND: Two and a half years after commencing routine vaccination with human rotavirus vaccine, an outbreak of rotavirus G2P[4] infection occurred in central Australia.',
 'BACKGROUND: Two and a half years after commencing routine vaccination with @PREDICAT$@OBJECT$@SUBJECT$, an outbreak of rotavirus G2P[4] infection occurred in central Australia.']

In [15]:
list(df.iloc[2,1:3])

['This subsequent outbreak provided the opportunity to evaluate vaccine effectiveness against hospitalizations for a non-vaccine-related genotype in the same population.',
 'This subsequent outbreak provided the opportunity to evaluate vaccine effectiveness against @SUBJECT$ for a non-vaccine-related genotype @OBJECT$ the same @PREDICAT$.']

In [None]:
# Original : This ... against hospitalizations ... genotype in the same population
# Formatted: This ... against @SUBJECT$ ... genotype @OBJECT$ the same @PREDICAT$

In [16]:
list(df.iloc[3,1:3])

['Children with rotavirus-confirmed gastroenteritis were individually matched by date of birth and Indigenous status with 4 control subjects.',
 '@PREDICAT$ @OBJECT$ rotavirus-confirmed @SUBJECT$ were individually matched by date of birth and Indigenous status with 4 control subjects.']

In [None]:
# Original : Children with ... rotavirus-confirmed gastroenteritis were individually ...
# Formatted: @PREDICAT$ @OBJECT$ rotavirus-confirmed @SUBJECT$ were individually ...

In [17]:
list(df.iloc[4,1:3])

['Nineteen (46%) of 41 case patients had received 2 doses of human rotavirus vaccine, compared with 87 (53%) of 164 control subjects.',
 'Nineteen (46%) of 41 case patients had received 2 doses of @PREDICAT$@OBJECT$@SUBJECT$, compared with 87 (53%) of 164 control subjects.']

In [None]:
# Original : Nineteen ... human rotavirus vaccine, ...

In [None]:
# Formatted: Nineteen ... @PREDICAT$@OBJECT$@SUBJECT$, ...

In [18]:
list(df.iloc[5,1:3])

['Nineteen (46%) of 41 case patients had received 2 doses of human rotavirus vaccine, compared with 87 (53%) of 164 control subjects.',
 'Nineteen (46%) of 41 case @PREDICAT$ had @OBJECT$ 2 doses of human rotavirus @SUBJECT$, compared with 87 (53%) of 164 control subjects.']

In [None]:
# Original : Nineteen ... patients had received 2 doses of human rotavirus vaccine ...

In [None]:
# Formatted (row 4): Nineteen ... patients had received 2 doses of @PREDICAT$@OBJECT$@SUBJECT$, ...

In [None]:
# Formatted (row 5): Nineteen ... @PREDICAT$ had @OBJECT$ 2 doses of human rotavirus @SUBJECT$, ...

In [19]:
list(df.iloc[6,1:3])

['On secondary analysis, there was evidence of a protective effect against disease complicated by acidosis in the subset of infants aged <12 months (odds ratio, .15; 95% confidence interval, .03-.84).',
 'On secondary analysis, there was evidence of a protective effect against @PREDICAT$ @OBJECT$ by @SUBJECT$ in the subset of infants aged <12 months (odds ratio, .15; 95% confidence interval, .03-.84).']

In [20]:
list(df.iloc[7,1:3])

['CONCLUSIONS: Evidence was not found for an overall protective effect of human rotavirus vaccine against hospitalization for rotavirus disease in this setting.',
 'CONCLUSIONS: Evidence was not found for an overall protective effect of @PREDICAT$@OBJECT$@SUBJECT$ against hospitalization for rotavirus disease in this setting.']

In [7]:
all_paths = glob(join('Format_sents', '*'))

In [8]:
all_paths

['Format_sents/labeled_sent_336531.csv',
 'Format_sents/labeled_sent_880942.csv',
 'Format_sents/labeled_sent_374527.csv',
 'Format_sents/labeled_sent_444837.csv',
 'Format_sents/labeled_sent_227041.csv',
 'Format_sents/labeled_sent_414796.csv',
 'Format_sents/labeled_sent_789753.csv',
 'Format_sents/labeled_sent_454270.csv',
 'Format_sents/labeled_sent_216764.csv',
 'Format_sents/form_sent_849503.csv',
 'Format_sents/labeled_sent_935261.csv',
 'Format_sents/labeled_sent_558626.csv',
 'Format_sents/form_sent_638818.csv',
 'Format_sents/labeled_sent_969843.csv',
 'Format_sents/form_sent_521379.csv',
 'Format_sents/labeled_sent_955101.csv',
 'Format_sents/form_sent_409495.csv',
 'Format_sents/form_sent_427503.csv',
 'Format_sents/form_sent_205471.csv',
 'Format_sents/labeled_sent_224918.csv',
 'Format_sents/labeled_sent_244187.csv',
 'Format_sents/labeled_sent_566379.csv',
 'Format_sents/labeled_sent_873693.csv',
 'Format_sents/form_sent_159277.csv',
 'Format_sents/labeled_sent_145490.cs

## Viz interface Data

In [24]:
df = pd.read_csv(join('all_triples', 'triples_probabilities.csv'), compression = 'gzip')

In [25]:
df.head()

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba
0,0,PROCESS_OF,C0003725,C0999630,1.0
1,1,ISA,C0039258,C0446169,1.0
2,2,ISA,C0318627,C0206590,1.0
3,3,ISA,C0446169,C0003725,1.0
4,4,PROCESS_OF,C0012634,C0020114,0.989018


In [27]:
# Directed graph with an edge SUBJECT_CUI -> OBJECT_CUI for the rows of df.
# No `edge_attr` is passed, so PREDICATE and label_proba are not stored on the edges.
FG = nx.from_pandas_edgelist(df, source='SUBJECT_CUI', target='OBJECT_CUI', create_using=nx.DiGraph())

In [28]:
FG

<networkx.classes.digraph.DiGraph at 0x7fb174f39b50>

In [108]:
# Compute eigenvector centrality over FG and write the node -> score mapping to JSON.
# NOTE(review): the following cell calls nx.eigenvector_centrality(FG) again;
# keeping the result in a variable would avoid recomputing it.
save_json(join('all_triples', 'all_entity_eigenvector_centrality.json'), nx.eigenvector_centrality(FG))

In [107]:
max(list(nx.eigenvector_centrality(FG).values()))

0.08687484613292439

In [31]:
# Smallest degree centrality over all nodes of FG. The original line was
# missing its closing parenthesis, so the cell could not be parsed as written.
min(nx.degree_centrality(FG).values())

2.89863474303603e-06

In [32]:
max(list(nx.degree_centrality(FG).values()))

0.3727934143018638

## Node Names

In [34]:
all_files = sorted(glob(join(PREDICATION_DIR,'*.gz')))

In [91]:
def canonize_dict(cui_dict):
    """Re-key a ``{raw_cui: value}`` mapping by normalized identifiers.

    Each raw key is expanded with ``process_cui`` and each value is split on
    ``|``. When a key expands to several identifiers but the value has a
    single part, that part is shared by all of them. Otherwise identifiers
    and value parts are paired positionally, and extra items on either side
    are dropped by ``zip``.

    Args:
        cui_dict: Mapping from raw identifier strings to ``|``-separated
            string values.

    Returns:
        New dict mapping each normalized identifier to its value part. Later
        entries overwrite earlier ones that share an identifier.
    """
    fixed_dict = {}
    for raw_cui, raw_value in cui_dict.items():
        cuis = process_cui(raw_cui)
        parts = raw_value.split('|')
        if len(parts) == 1 and len(cuis) > 1:
            # One value for several identifiers: reuse it for each of them.
            parts = parts * len(cuis)
        for canon_cui, part in zip(cuis, parts):
            fixed_dict[canon_cui] = part
    return fixed_dict

def get_entity_names_types():
    """Collect identifier -> name and identifier -> semantic-type lookups.

    Reads every ``*.gz`` file under ``PREDICATION_DIR`` in sorted order and
    merges the object and subject columns of each file into two dicts via
    ``canonize_dict``. Updates are applied object-first, then subject, file by
    file, so on a shared identifier the most recently processed value wins.

    NOTE(review): the recorded run printed warnings pointing at the
    ``read_csv`` call -- presumably pandas ``DtypeWarning`` for mixed-type
    columns; confirm, and consider passing explicit dtypes.

    Returns:
        Tuple ``(all_entity_names, all_entity_types)`` of dicts keyed by
        normalized identifier.
    """
    all_entity_names = {}
    all_entity_types = {}
    # (key column, value column, destination dict), in the order they are merged.
    merge_plan = (
        ('OBJECT_CUI', 'OBJECT_NAME', all_entity_names),
        ('OBJECT_CUI', 'OBJECT_SEMTYPE', all_entity_types),
        ('SUBJECT_CUI', 'SUBJECT_NAME', all_entity_names),
        ('SUBJECT_CUI', 'SUBJECT_SEMTYPE', all_entity_types),
    )
    for f_name in tqdm(sorted(glob(join(PREDICATION_DIR, '*.gz')))):
        df = pd.read_csv(
            f_name,
            compression='gzip',
            sep=',',
            encoding='iso-8859-1',
            header=None,
            names=PREDICATION_COLS,
        )
        for key_col, value_col, destination in merge_plan:
            raw_pairs = dict(zip(list(df[key_col]), list(df[value_col])))
            destination.update(canonize_dict(raw_pairs))
    return all_entity_names, all_entity_types



In [92]:
all_entity_names, all_entity_types = get_entity_names_types()

  0%|          | 0/30 [00:00<?, ?it/s]

  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)
  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)
  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)
  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)
  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)
  df = pd.read_csv(f_name,  compression= 'gzip', sep = ',', encoding='iso-8859-1', header=None, names = PREDICATION_COLS)


In [102]:
save_json(join('all_triples', 'all_entity_names.json'), all_entity_names)

In [103]:
save_json(join('all_triples', 'all_entity_types.json'), all_entity_types)

In [96]:
len(all_entity_names)

370641

In [95]:
len(all_entity_types)

370641

In [86]:
# NOTE(review): the recorded output is a CUI -> name dict, so `df` was not a
# DataFrame when this cell ran (execution count 86 predates the cells above).
# Presumably stale kernel state -- confirm, then fix or drop this cell.
df[1]

{'C0999630': 'Lepus capensis',
 'C0446169': 'California Group Viruses',
 'C0206590': 'Coltivirus',
 'C0003725': 'JUN',
 'C0020114': 'Human',
 'C0012634': 'Disease',
 'C0003241': 'Antibodies',
 'C0002273': 'FHL1',
 'C0006034': 'Borrelia burgdorferi',
 'C0019878': 'homocysteine',
 'C0025646': 'Methionine',
 'C0017262': 'Gene Expression',
 'C0001675': 'CFD',
 'C0042567': 'Vertebrates',
 'C1167395': 'host',
 'C0003062': 'HCRTR2',
 'C0162318': 'Farm Animals',
 'C1136254': 'Microbicides',
 'C0949665': 'Fluoroquinolones',
 'C0013227': 'Pharmaceutical Preparations',
 'C0014061': 'Tick-Borne Encephalitis',
 'C0030705': 'Patients',
 'C0086287': 'Females',
 'C0086582': 'Males',
 'C1552130': '16s',
 'C0237401': 'Individual',
 'C1457887': 'Symptoms',
 'C0021311': 'Infection',
 'C0089701': '1-oleoyl-2-stearoylphosphatidylcholine',
 'C0012854': 'DNA',
 'C0456981': 'Specific antigen',
 'C0021368': 'Inflammation',
 'C0004368': 'Autoimmunity',
 'C0039194': 'T-Lymphocyte',
 'C0003315': 'HSPB1',
 'C067089

## Full Data

In [None]:
# Column-role mapping for the triples table: which columns hold the source
# entity, the relation and the destination entity. Every other role is left
# as None. (The consumer of this dict is not shown in this notebook.)
schema = dict(
    source_col='SUBJECT_CUI',
    rel_col='PREDICATE',
    dest_col='OBJECT_CUI',
    source_score=None,
    rel_score=None,
    dest_score=None,
    source_type=None,
    rel_type=None,
    dest_type=None,
    triple_attr_col=None,
    source_attr_col=None,
    target_attr_col=None,
)

In [111]:
def visual_triples():
    """Load the triples table and attach per-entity display columns.

    Reads ``all_triples/triples_probabilities.csv`` (gzip) and the three JSON
    lookups saved beside it, then adds for both subject and object:
    ``*_EC`` (eigenvector centrality), ``*_NAME`` and ``*_TYPE``. Values are
    looked up by CUI with ``Series.map``, so a CUI missing from a lookup
    yields a missing value in that column.

    Returns:
        The DataFrame with the six extra columns appended in the order
        SUBJECT_EC, OBJECT_EC, SUBJECT_NAME, OBJECT_NAME, SUBJECT_TYPE,
        OBJECT_TYPE.
    """
    df = pd.read_csv(join('all_triples', 'triples_probabilities.csv'), compression='gzip')
    all_entity_names = read_json(join('all_triples', 'all_entity_names.json'))
    all_entity_types = read_json(join('all_triples', 'all_entity_types.json'))
    all_entity_ec = read_json(join('all_triples', 'all_entity_eigenvector_centrality.json'))
    # (new-column suffix, lookup dict); each is applied to subject then object.
    lookups = (
        ('_EC', all_entity_ec),
        ('_NAME', all_entity_names),
        ('_TYPE', all_entity_types),
    )
    for suffix, lookup in lookups:
        for role in ('SUBJECT', 'OBJECT'):
            df[role + suffix] = df[role + '_CUI'].map(lookup)
    return df

In [112]:
df = visual_triples()

In [113]:
df

Unnamed: 0,ORIGIN_ID,PREDICATE,SUBJECT_CUI,OBJECT_CUI,label_proba,SUBJECT_EC,OBJECT_EC,SUBJECT_NAME,OBJECT_NAME,SUBJECT_TYPE,OBJECT_TYPE
0,0,PROCESS_OF,C0003725,C0999630,1.000000,0.003565,0.000934,JUN,Lepus capensis (organism),gngm,mamm
1,1,ISA,C0039258,C0446169,1.000000,0.000257,0.000259,Tahyna virus,California Group Viruses,virs,virs
2,2,ISA,C0318627,C0206590,1.000000,0.000041,0.000223,Eyach virus,Coltivirus,virs,virs
3,3,ISA,C0446169,C0003725,1.000000,0.000259,0.003565,California Group Viruses,JUN,virs,gngm
4,4,PROCESS_OF,C0012634,C0020114,0.989018,0.079762,0.076038,Disease,Human,dsyn,grup
...,...,...,...,...,...,...,...,...,...,...,...
28416912,23857434,COEXISTS_WITH,C1413909,C1413914,1.000000,0.000561,0.000397,LAMA3,LAMB3,gngm,aapp
28416913,23857435,PROCESS_OF,C4023614,C1413909,1.000000,0.000016,0.000561,Digenic inheritance,LAMA3,genf,gngm
28416914,23857435,PROCESS_OF,C4023614,C1413909,1.000000,0.000016,0.000561,Digenic inheritance,LAMA3,genf,gngm
28416915,23857436,PROCESS_OF,C4023614,C1413914,1.000000,0.000016,0.000397,Digenic inheritance,LAMB3,genf,aapp


In [114]:
df.to_csv(join('all_triples', 'triples_probabilities_visual.csv'), index = False, compression = 'gzip')