In [2]:
import os
import pandas as pd
import csv
import json
import requests as req

def switch_dictset_to_dictlist(d):
    d2 = dict()
    for k,v in d.items():
        d2[k] = list(v)
    return d2

### Process input files

In [40]:
headers = ['head', 'relationship', 'tail', 'score']

#### Disease MeSH tree
Disease MeSH Tree - subclass of - Disease MeSH Tree

In [41]:
# Input 
df = pd.read_csv('../input/Disease_(MeSH_Tree)_2_Disease_(MeSH_Tree).csv')

# Output
cvd_df = df[df['Disease (MeSH Tree)'].str.startswith('MeSH_Tree_Disease:C14')]
cvd_df.columns = ['MeSH_Tree_Disease_h', 'MeSH_Tree_Disease_t', 'Relationship']
cvd_df = cvd_df[['MeSH_Tree_Disease_h', 'Relationship', 'MeSH_Tree_Disease_t']]
cvd_df['Score'] = [1.0]*len(cvd_df)
cvd_df.columns = headers
cvd_df.to_csv('../data/mesh_disease_to_mesh_disease.csv')
cvd_df.tail()

Unnamed: 0,head,relationship,tail,score
7972,MeSH_Tree_Disease:C14.907.940.910,-subclass_of->,MeSH_Tree_Disease:C14.907.940,1.0
7973,MeSH_Tree_Disease:C14.907.946,-subclass_of->,MeSH_Tree_Disease:C14.907,1.0
7974,MeSH_Tree_Disease:C14.907.952,-subclass_of->,MeSH_Tree_Disease:C14.907,1.0
7975,MeSH_Tree_Disease:C14.907.952.760,-subclass_of->,MeSH_Tree_Disease:C14.907.952,1.0
7976,MeSH_Tree_Disease:C14.907.952.880,-subclass_of->,MeSH_Tree_Disease:C14.907.952,1.0


#### CaseOLAP
Protein - CaseOLAP_Score - Disease Name

In [42]:
# Input 
caseolap_dict = json.load(open('../input/os_cvd_caseolap.json'))

# Output
out_path = '../data/protein_to_disease.tsv'
with open(out_path, 'w') as fout: 
    writer = csv.writer(fout, delimiter='\t')
    writer.writerow(headers)
    for disease, protein_to_score_d in caseolap_dict.items():
        for protein, score in protein_to_score_d.items():
            writer.writerow(['Disease_Name:'+disease, '-CaseOLAP_Score-', 'Protein:'+protein, score])
            
caseolap_edge_df = pd.read_table(out_path)
caseolap_edge_df.tail()

Unnamed: 0,head,relationship,tail,score
1771,Disease_Name:OHD,-CaseOLAP_Score-,Protein:Q9NT68,0.0
1772,Disease_Name:OHD,-CaseOLAP_Score-,Protein:Q14766,0.0
1773,Disease_Name:OHD,-CaseOLAP_Score-,Protein:P61073,0.029786
1774,Disease_Name:OHD,-CaseOLAP_Score-,Protein:P55008,0.008804
1775,Disease_Name:OHD,-CaseOLAP_Score-,Protein:P10916,0.062273


#### disease category to mesh tree
ABBREV - is - Disease MeSH Tree

In [43]:
### Input
dis_cats = json.load(open('../input/cvd/textcube_config.json'))
with open('../input/categories.txt') as fin:
    mesh_trees = [line.strip().split(' ') for line in fin.readlines()]

### Output 
out_path = '../data/category_to_mesh_tree.tsv'
with open(out_path, 'w') as fout:
    writer = csv.writer(fout, delimiter = '\t')
    writer.writerow(headers)
    cat_to_trees = dict(zip(dis_cats, mesh_trees))
    for cat, trees in cat_to_trees.items():
        for tree in trees:
            writer.writerow(['Disease:'+cat, '-cat_is_mesh-', 'MeSH_Tree_Disease:'+tree, 1.0])
            
cat_to_mesh_edge_df = pd.read_table(out_path)
cat_to_mesh_edge_df.tail()

Unnamed: 0,head,relationship,tail,score
11,Disease:OHD,-cat_is_mesh-,MeSH_Tree_Disease:C14.280.383,1.0
12,Disease:OHD,-cat_is_mesh-,MeSH_Tree_Disease:C14.280.470,1.0
13,Disease:OHD,-cat_is_mesh-,MeSH_Tree_Disease:C14.280.945,1.0
14,Disease:OHD,-cat_is_mesh-,MeSH_Tree_Disease:C14.280.459,1.0
15,Disease:OHD,-cat_is_mesh-,MeSH_Tree_Disease:C14.280.720,1.0


### Merged Edges

In [50]:
final_edge_path = '../data/caseolap_edge_list.tsv'
edge_df = caseolap_edge_df.append(cvd_df).append(cat_to_mesh_edge_df)
edge_df.to_csv(final_edge_path, sep='\t', index=False)
edge_df = pd.read_table(final_edge_path)
edge_df.tail()

Unnamed: 0,head,relationship,tail,score
2499,Disease:OHD,-cat_is_mesh-,MeSH_Tree_Disease:C14.280.383,1.0
2500,Disease:OHD,-cat_is_mesh-,MeSH_Tree_Disease:C14.280.470,1.0
2501,Disease:OHD,-cat_is_mesh-,MeSH_Tree_Disease:C14.280.945,1.0
2502,Disease:OHD,-cat_is_mesh-,MeSH_Tree_Disease:C14.280.459,1.0
2503,Disease:OHD,-cat_is_mesh-,MeSH_Tree_Disease:C14.280.720,1.0


### Merged Nodes

In [60]:
# Input
nodes = set(edge_df['head']).union(edge_df['tail'])

# Output
node_path = '../data/caseolap_node_list.tsv'
with open(node_path, 'w') as fout:
    writer = csv.writer(fout, delimiter = '\t')
    writer.writerow(['node', 'node_type'])
    for node in nodes:
        node_type = node.split(':')[0]
        writer.writerow([node, node_type])

node_df = pd.read_table(node_path).tail()
node_df.tail()

Unnamed: 0,node,node_type
947,MeSH_Tree_Disease:C14.280.123.500,MeSH_Tree_Disease
948,MeSH_Tree_Disease:C14.907.253.573.800,MeSH_Tree_Disease
949,MeSH_Tree_Disease:C14.280.647.187.575,MeSH_Tree_Disease
950,MeSH_Tree_Disease:C14.907.253.573.400.450.050,MeSH_Tree_Disease
951,Protein:O95967,Protein


### Protein Subset File
Using all proteins here

In [63]:
proteins = [node for node in nodes if node.startswith('Protein:')]
protein_subset_outfile = '../data/ca_proteins.txt'
with open(protein_subset_outfile, 'w') as fout:
    for protein in proteins:
        fout.write(protein+'\n')

In [64]:
caseolap_dict

{'CM': {'O15296': 0.019324618428428992,
  'Q9GZY6': 0.0,
  'Q05586': 0.0,
  'O75923': 0.06328173734409184,
  'P0DUB6': 0.0,
  'P19801': 0.023640126086834592,
  'P37288': 0.03133690555033059,
  'P09486': 0.05870150375960224,
  'P07911': 0.012995432677105,
  'Q6UXI9': 0.0,
  'P01130': 0.03991853083474544,
  'Q92736': 0.09825772836107488,
  'O76038': 0.0,
  'P0DP25': 0.0,
  'Q14520': 0.0,
  'P13497': 0.008282284938015761,
  'P20800': 0.026897831883016414,
  'Q14114': 0.0,
  'Q9GZU1': 0.0,
  'P0DP23': 0.0,
  'Q9Y680': 0.0,
  'P48740': 0.0,
  'Q13201': 0.0,
  'P08637': 0.0,
  'Q04721': 0.008294360256728528,
  'P05186': 0.008294360256728528,
  'O14917': 0.0,
  'Q96L12': 0.01667518519518369,
  'Q92629': 0.1009997732181845,
  'P23142': 0.02272009656517397,
  'P55283': 0.0,
  'P46531': 0.03952636826869464,
  'Q8WZ42': 0.11745034649735298,
  'Q8WWQ8': 0.0,
  'P53805': 0.03310758770860603,
  'P14543': 0.008297279057156623,
  'P51681': 0.09097131264457788,
  'Q9NY15': 0.03036571166433748,
  'P9816

In [71]:
! python prepare_caseolap_graph.py -g os_cvd