In [4]:
import networkx as nx
import pandas as pd


# Definitions


In [5]:
definitions = pd.read_json('../data/definitions.json', typ='split')
using_nodes = ["Gene","Compound","Disease"]

#metanodes -"Gene","Compound","Disease"
for node in using_nodes:
    print(f'{node} : {definitions["metanodes"].get(node)}')

Gene : Protein-coding human genes. From Entrez Gene.
Compound : Approved small molecule compounds with documented chemical structures. From DrugBank.
Disease : Complex diseases, selected to be distinct and specific enough to be clinically relevant yet general enough to be well annotated. From Disease Ontology.


In [6]:
not_using_nodes = [node for node in definitions["metanodes"].keys() if node not in using_nodes]

In [7]:
using_edges = []

for edge in definitions["metaedges"].keys():
    e = edge.split('–')
    if (e[0] not in not_using_nodes) and (e[-1] not in not_using_nodes):
        using_edges.append(edge.replace('–',' - '))
        print(f'{edge} : {definitions["metaedges"].get(edge)}','\n')

Compound–binds–Gene : The Compound physically binds to the Gene's protein product according to BindingDB, DrugBank, or DrugCentral 

Compound–downregulates–Gene : The Gene is under-expressed in samples exposed to the Compound according to a consensus signature from LINCS L1000 

Compound–palliates–Disease : The Compound is a symptomatic indication for the Disease (i.e. the Compound treats a significant symptom of the Disease) according to PharmacotherapyDB 

Compound–resembles–Compound : The Compounds are chemical similar to each other: Their extended connectivity fingerprints have a Dice coefficient ≥ 0.5. 

Compound–treats–Disease : The Compound is a disease-modifying indication for the Disease (i.e. the Compound therapeutically changes the underlying or downstream biology of the Disease) according to PharmacotherapyDB 

Compound–upregulates–Gene : The Gene is over-expressed in samples exposed to the Compound according to a consensus signature from LINCS L1000 

Disease–associates–Ge

# nodes


In [8]:
metanodes = pd.read_csv('../data/metanodes.tsv', sep='\t')
metanodes.query('metanode in @using_nodes')

Unnamed: 0,metanode,abbreviation,metaedges,nodes,unconnected_nodes
3,Compound,C,8,1552,14
4,Disease,D,8,137,1
5,Gene,G,16,20945,1800


In [9]:
nodes = pd.read_csv('../data/hetionet-v1.0-nodes.tsv', sep='\t')

In [10]:
nodes.query('kind in @using_nodes', inplace=True)

# nodes['id'] = nodes['id'].str.split('::').str[-1]

In [11]:
nodes

#### id is the node identifier prepended with the node type plus :: as a separator. 
#### name is the node name. 
#### kind is the node type.

Unnamed: 0,id,name,kind
13174,Compound::DB00014,Goserelin,Compound
13175,Compound::DB00035,Desmopressin,Compound
13176,Compound::DB00050,Cetrorelix,Compound
13177,Compound::DB00091,Cyclosporine,Compound
13178,Compound::DB00093,Felypressin,Compound
...,...,...,...
35803,Gene::9991,PTBP3,Gene
35804,Gene::9992,KCNE2,Gene
35805,Gene::9993,DGCR2,Gene
35806,Gene::9994,CASP8AP2,Gene


In [12]:
compound = nodes[nodes['id'].str.contains('Compound')]['id'].to_list()
gene = nodes[nodes['id'].str.contains('Gene')]['id'].to_list()
disease = nodes[nodes['id'].str.contains('Disease')]['id'].to_list()

# edges

In [13]:
metaedges = pd.read_csv('../data/metaedges.tsv', sep='\t')
metaedges.query('metaedge in @using_edges | metaedge == "Gene > regulates > Gene"',inplace=True)
metaedges

Unnamed: 0,metaedge,abbreviation,edges,source_nodes,target_nodes,unbiased
3,Compound - binds - Gene,CbG,11571,1389,1689,0
5,Compound - downregulates - Gene,CdG,21102,734,2880,21102
6,Compound - palliates - Disease,CpD,390,221,50,0
7,Compound - resembles - Compound,CrC,6486,1042,1054,6486
8,Compound - treats - Disease,CtD,755,387,77,0
9,Compound - upregulates - Gene,CuG,18756,703,3247,18756
10,Disease - associates - Gene,DaG,12623,134,5392,1284
11,Disease - downregulates - Gene,DdG,7623,44,5745,7623
14,Disease - resembles - Disease,DrD,543,112,106,0
15,Disease - upregulates - Gene,DuG,7731,44,5630,7731


In [14]:
edge_file = '../data/edges.sif'
edges = pd.read_csv(edge_file, sep='\t',)

In [15]:
edges = pd.merge(metaedges[['metaedge','abbreviation']], edges, left_on='abbreviation', right_on='metaedge')
edges.drop('metaedge_y', axis=1, inplace=True)
edges

Unnamed: 0,metaedge_x,abbreviation,source,target
0,Compound - binds - Gene,CbG,Compound::DB00514,Gene::1136
1,Compound - binds - Gene,CbG,Compound::DB00686,Gene::2246
2,Compound - binds - Gene,CbG,Compound::DB00786,Gene::4317
3,Compound - binds - Gene,CbG,Compound::DB01209,Gene::4988
4,Compound - binds - Gene,CbG,Compound::DB01588,Gene::2560
...,...,...,...,...
562101,Gene > regulates > Gene,Gr>G,Gene::5184,Gene::8659
562102,Gene > regulates > Gene,Gr>G,Gene::22818,Gene::5613
562103,Gene > regulates > Gene,Gr>G,Gene::29888,Gene::5467
562104,Gene > regulates > Gene,Gr>G,Gene::5894,Gene::595


# label

In [16]:
connected_compound_disease = edges[(edges['source'].str.contains('Compound')) & (edges['target'].str.contains('Disease'))]
connected_compound_disease

Unnamed: 0,metaedge_x,abbreviation,source,target
32673,Compound - palliates - Disease,CpD,Compound::DB01175,Disease::DOID:3312
32674,Compound - palliates - Disease,CpD,Compound::DB00321,Disease::DOID:7148
32675,Compound - palliates - Disease,CpD,Compound::DB00176,Disease::DOID:594
32676,Compound - palliates - Disease,CpD,Compound::DB01037,Disease::DOID:10652
32677,Compound - palliates - Disease,CpD,Compound::DB00945,Disease::DOID:9074
...,...,...,...,...
40299,Compound - treats - Disease,CtD,Compound::DB00860,Disease::DOID:8893
40300,Compound - treats - Disease,CtD,Compound::DB01265,Disease::DOID:2043
40301,Compound - treats - Disease,CtD,Compound::DB01030,Disease::DOID:1324
40302,Compound - treats - Disease,CtD,Compound::DB01101,Disease::DOID:5041


In [17]:
connected_compound_disease.drop(['metaedge_x', 'abbreviation'],axis=1, inplace=True)
connected_compound_disease = connected_compound_disease.assign(label = 1)
connected_compound_disease

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  connected_compound_disease.drop(['metaedge_x', 'abbreviation'],axis=1, inplace=True)


Unnamed: 0,source,target,label
32673,Compound::DB01175,Disease::DOID:3312,1
32674,Compound::DB00321,Disease::DOID:7148,1
32675,Compound::DB00176,Disease::DOID:594,1
32676,Compound::DB01037,Disease::DOID:10652,1
32677,Compound::DB00945,Disease::DOID:9074,1
...,...,...,...
40299,Compound::DB00860,Disease::DOID:8893,1
40300,Compound::DB01265,Disease::DOID:2043,1
40301,Compound::DB01030,Disease::DOID:1324,1
40302,Compound::DB01101,Disease::DOID:5041,1


In [18]:
connected_compound_disease_list = list(zip(connected_compound_disease['source'],connected_compound_disease['target']))

# 그래프 생성

In [19]:
import networkx as nx

G = nx.Graph()

G.add_nodes_from(compound,group='compound', weight=1)
G.add_nodes_from(gene,group='gene', weight=0.1)
G.add_nodes_from(disease,group='disease', weight=10)

In [20]:
# 엣지 추가 및 가중치 부여
for _, edge in edges.iterrows():
    source = edge['source']
    target = edge['target']
    weight = 2 if G.nodes[source]['group'] == 'compound' and G.nodes[target]['group'] == 'disease' else 0.1
    G.add_edge(source, target, weight=weight)

In [99]:
# #DWPC

# def calculate_dwpc(graph, start_node, end_node):
#     dwpc = 0.0

#     # Find all simple paths from start_node to end_node
#     all_paths = list(nx.all_simple_paths(graph, start_node, end_node))

#     for path in all_paths:
#         # Calculate the sum of inverse degrees for each intermediate node
#         intermediate_nodes = path[1:-1]
#         dwpc += sum(1 / graph.degree(node) for node in intermediate_nodes)

#     return dwpc


# # Update the graph with edge weights
# for source, target, weight in edge_weights:
#     G.add_edge(source, target, weight=weight)

# # 엣지 추가 및 가중치 부여
# for _, edge in edges.iterrows():
#     source = edge['source']
#     target = edge['target']
#     weight = calculate_dwpc(G,source,target)
#     G.add_edge(source, target, weight=weight)

In [100]:
print(G['Compound::DB00514']['Gene::1136'])

{'weight': 0.1}


In [101]:
print(G['Compound::DB01175']['Disease::DOID:3312'])

{'weight': 2}


# Node2Vec

In [102]:
# from node2vec import Node2Vec
# # Node2Vec 모델 생성 및 학습

# # bfs
# node2vec_bfs = Node2Vec(G, dimensions=60, walk_length=30, num_walks=200, workers=4, p=0.0001, q=1)
# model_bfs = node2vec_bfs.fit(window=10, min_count=1)

In [103]:
# # Save embeddings for later use
# model_bfs.wv.save_word2vec_format('./model_Save_embeddings_bfs')

# # Save model for later use
# model_bfs.save('./model_Save_model_bfs')

In [104]:
# from node2vec import Node2Vec
# Node2Vec 모델 생성 및 학습

# dfs
# node2vec = Node2Vec(G, dimensions=60, walk_length=30, num_walks=200, workers=4, p=1.0, q=0.0001)
# model = node2vec.fit(window=10, min_count=1)

In [105]:
# # Save embeddings for later use
# model.wv.save_word2vec_format('./model_Save_embeddings')

# # Save model for later use
# model.save('./model_Save_model')

In [27]:
from node2vec import Node2Vec
# Node2Vec 모델 생성 및 학습

# 지정x
node2vec_x = Node2Vec(G, dimensions=60, walk_length=30, num_walks=200, workers=4)
model_x = node2vec_x.fit(window=10, min_count=1)

Computing transition probabilities:   0%|          | 0/22634 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 50/50 [41:23<00:00, 49.66s/it]
Generating walks (CPU: 2): 100%|██████████| 50/50 [41:41<00:00, 50.03s/it]
Generating walks (CPU: 3): 100%|██████████| 50/50 [41:23<00:00, 49.67s/it]
Generating walks (CPU: 4): 100%|██████████| 50/50 [40:14<00:00, 48.28s/it]


In [29]:
# Save embeddings for later use
model_x.wv.save_word2vec_format('./model_Save_embeddings_x')

# Save model for later use
model_x.save('./model_Save_model_x')

#### *임베딩, 모델 각각 저장됨 - 불러와서 사용

# link prediction

In [21]:
from gensim.models import KeyedVectors
Save_embeddings = KeyedVectors.load_word2vec_format("model_Save_embeddings")

In [30]:
from gensim.models import Word2Vec

# p=0.0001, q=1 (bfs)
Save_model_bfs = Word2Vec.load('./model_Save_model_bfs')
# p=1.0, q=0.0001 (dfs)
Save_model_dfs = Word2Vec.load('./model_Save_model')
# 지정x
Save_model_x = Word2Vec.load('./model_Save_model_x')

In [31]:
bfs_q1 = [Save_model_bfs.wv.similarity(com,dis) for com,dis in connected_compound_disease_list]
dfs_p1 = [Save_model_dfs.wv.similarity(com,dis) for com,dis in connected_compound_disease_list]
x = [Save_model_x.wv.similarity(com,dis) for com,dis in connected_compound_disease_list]
compound_disease_similarity = pd.DataFrame(data={'bfs_q1':bfs_q1, 'dfs_p1':dfs_p1, 'x':x})
compound_disease_similarity.describe()

Unnamed: 0,bfs_q1,dfs_p1,x
count,1145.0,1145.0,1145.0
mean,0.602098,0.782286,0.803298
std,0.115631,0.102518,0.112476
min,0.17327,0.42686,0.428203
25%,0.528952,0.726834,0.737291
50%,0.602524,0.804462,0.821349
75%,0.678686,0.858445,0.890742
max,0.891915,0.979788,0.987413


In [24]:
# dfs 선택
# 중앙값을 임계값으로 link prediction
predict = []
for com in compound:
    for dis in disease:
        if Save_model_dfs.wv.similarity(com,dis) > 0.8:
            predict.append((com,dis))

In [26]:
# recall,precision
TP = [1 for i in predict if i in connected_compound_disease_list]
TP = sum(TP)
FP = len(predict) - TP

recall = TP / 1145
precision = TP / len(predict)
f1_score = 2 * (precision * recall) / (precision + recall)

print(recall,precision,f1_score)

0.5187772925764192 0.8354430379746836 0.6400862068965518


# 데이터

In [None]:
#cartesian product
df1 = pd.DataFrame({'source':compound})
df2 = pd.DataFrame({'target':disease})

compound_disease = pd.merge(df1.assign(key=1),df2.assign(key=1), on='key').drop('key', axis=1)

#label
compound_disease = pd.merge(compound_disease, connected_compound_disease, how='outer')
compound_disease = compound_disease.fillna(0)
compound_disease.drop('com_dis_similarity', axis=1,inplace=True)
compound_disease

Unnamed: 0,source,target,label
0,Compound::DB00014,Disease::DOID:0050156,0.0
1,Compound::DB00014,Disease::DOID:0050425,0.0
2,Compound::DB00014,Disease::DOID:0050741,0.0
3,Compound::DB00014,Disease::DOID:0050742,0.0
4,Compound::DB00014,Disease::DOID:0060073,0.0
...,...,...,...
212619,Compound::DB09028,Disease::DOID:9744,0.0
212620,Compound::DB09028,Disease::DOID:9835,0.0
212621,Compound::DB09028,Disease::DOID:986,0.0
212622,Compound::DB09028,Disease::DOID:9917,0.0


In [None]:
compound_disease['label'].value_counts()

0.0    211479
1.0      1145
Name: label, dtype: int64

##### -> label bias가 심하다

In [None]:
# # input테이블 저장
# compound_disease.to_csv('compound_disease.csv')