## PMID-MeSH-Protein Network Data

### PMID to Protein Edge Data

In [2]:
import pandas as pd
import json as json

**Entitycount data in CVD data**

In [11]:
pmid2p = {}
with open("../caseolap-cvd/data/entitycount.txt",'r') as f:
    for line in f:
        pmid = line.split(" ")[0]
        data = line.split(" ")[1:]
        proteins = []
        for item in data:
            proteins.append(item.split("|")[0])
        
        pmid2p.update({pmid:proteins})

In [12]:
len(pmid2p.keys())

13524

**Entitycount data in OS data**

In [13]:
with open("../caseolap-os/data/entitycount.txt",'r') as f:
    for line in f:
        pmid = line.split(" ")[0]
        if pmid not in pmid2p:
            data = line.split(" ")[1:]
            proteins = []
            for item in data:
                proteins.append(item.split("|")[0])
            
       
            pmid2p.update({pmid:proteins})

In [14]:
len(pmid2p.keys())

94702

In [15]:
with open("data/pmid2p.json",'w')as f12:
    json.dump(pmid2p,f12)

**PMID to Proteins Graph Data**

In [83]:
DATA = []
for key,value in pmid2p.items():
    for protein in value:
        DATA.append({"pmid":key,"protein":protein,"edge" : "MENTIONS"})

In [84]:
df = pd.DataFrame(DATA)
df.head()

Unnamed: 0,pmid,protein,edge
0,20091048,P56539,MENTIONS
1,30517097,P11532,MENTIONS
2,27853260,P16615,MENTIONS
3,19432907,P11532,MENTIONS
4,24898986,O00555,MENTIONS


In [85]:
df.to_csv("kgdata/pmid2protein-edge.csv")

### PMID Nodes Data

In [48]:
import sys
import json
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q
from collections import Counter

In [49]:
allpmids = df['pmid']

In [50]:
"""
Search and count entities: to optimize and find count from indexer
"""
es = Elasticsearch(timeout=300)
k = 0
Data = []
for item in allpmids:
        s = Search(using=es, index="pubmed")\
                    .params(request_timeout=300)\
                    .query("match_phrase",pmid=item)
        
        
        for hit in s.scan():
                
                pmid = str(hit.pmid)
                title = str(hit.title)
                abstract = str(hit.abstract)
                mesh = hit.MeSH
                date = hit.date
            
                Data.append({"pmid":pmid,\
                             "title":title,\
                             "abstract":abstract,\
                             "mesh" : mesh,\
                             "date": date})  
        
        k = k +1
        if k%5000 == 0:
            print(k,'entity counted!')
            

5000 entity counted!
10000 entity counted!
15000 entity counted!
20000 entity counted!
25000 entity counted!
30000 entity counted!
35000 entity counted!
40000 entity counted!
45000 entity counted!
50000 entity counted!
55000 entity counted!
60000 entity counted!
65000 entity counted!
70000 entity counted!
75000 entity counted!
80000 entity counted!
85000 entity counted!
90000 entity counted!
95000 entity counted!
100000 entity counted!
105000 entity counted!


In [51]:
df2 = pd.DataFrame(Data)
df2.head(1)

Unnamed: 0,pmid,title,abstract,mesh,date
0,20091048,Cardiac sodium channelopathies.,cardiac sodium channel are protein complexes t...,"[Animals, Arrhythmias, Cardiac, genetics, phys...","{'Year': '2010', 'Month': 'Jul', 'Day': '', 'S..."


In [52]:
df2.to_csv("kagdata/allpmid-nodes.csv")

In [53]:
df.shape

(106674, 3)

### PMID to MeSH Edge Data

In [72]:
with open("../MeSH/name2id.json", 'r') as ff:
    name2id = json.load(ff)

In [74]:
DATA = []
for item in Data:
    pmid = item['pmid']
    mesh = item['mesh']
    #print(pmid,mesh[0])
   
    for m in mesh:
        m = m.lower()
        try:
            ID = name2id[m]
        except:
            ID = m
        DATA.append({"pmid":pmid,"mesh":m, "meshtree_id":ID})  

In [75]:
df3 = pd.DataFrame(DATA)
df3.head(2)

Unnamed: 0,pmid,mesh,meshtree_id
0,20091048,animals,B01.050
1,20091048,"arrhythmias, cardiac",C23.550.073


In [76]:
df3.to_csv("kgdata/pmid2mesh-all-edge.csv")

### MeSH node data

##### CVD Category and MeSH data

In [17]:
with open("data/name2id.json", "r")as f0:
    name2id = json.load(f0)

In [25]:
with open("data/id2name.json", "r")as ff:
    id2name = json.load(ff)

In [18]:
with open("data/cvd_cat2mesh.json", "r")as f1:
    cvd_cat2mesh = json.load(f1)

In [19]:
allmesh = {}
for cat, meshes in cvd_cat2mesh.items():
    for m in meshes:
        allmesh.update({name2id[m]:m})   

#### Add OS category MeSH data

In [28]:
with open('data/os-categories.txt','r')as f2:
    for line in f2:
        meshes = line[0:-1].split(" ")
        for m in meshes:
            try:
                mname = id2name[m]
            except:
                continue
                
            allmesh.update({m:mname})

In [40]:
len(allmesh.keys())

254

#### Select unique mesh node Data

In [32]:
merged_mesh = []
for k,v in allmesh.items():
    merged_mesh.append({"mid":k, "name":v})

In [36]:
mdf = pd.DataFrame(merged_mesh)
mdf.head(1)

Unnamed: 0,mid,name
0,C14.280.238,cardiomyopathies


In [37]:
mdf.to_csv("kgdata/merged-mesh-nodes.csv")

#### PMID to Mesh Edge Data

In [38]:
df = pd.read_csv("kgdata/pmid2mesh-all-edge.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,pmid,mesh,meshtree_id
0,0,20091048,animals,B01.050
1,1,20091048,"arrhythmias, cardiac",C23.550.073
2,2,20091048,genetics,H01.158.273.343
3,3,20091048,physiopathology,physiopathology
4,4,20091048,brugada syndrome,C16.320.100


In [39]:
df.shape

(2901140, 4)

In [41]:
selected_edges = [] 
for pmid,name,mid in zip(df["pmid"],df['mesh'],df["meshtree_id"]):
    if mid in allmesh:
        selected_edges.append({"pmid": pmid,"name": name,"mid": mid})

In [43]:
dfe = pd.DataFrame(selected_edges)
dfe.head(3)

Unnamed: 0,pmid,name,mid
0,20091048,"arrhythmias, cardiac",C23.550.073
1,20091048,brugada syndrome,C16.320.100
2,20091048,"cardiomyopathy, dilated",C16.320.488.750


In [44]:
dfe.shape

(59306, 3)

In [45]:
dfe.to_csv("kgdata/pmid2mesh-merged-edge.csv")