In [24]:
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import islice
import pickle
import networkx as nx
import openne
import openne.node2vec
from openne.graph import *
import scipy.sparse as sp
import random
random.seed(32)
np.random.seed(32)

In [25]:
# Functions defined in this cell: degree_graph, load_net, getNodelist, save_sampledEdges, extract_edges, extractEdgesByRatioList

"""
Purpose: compute and plot the degree distribution of a network
Input:   G, a graph in networkx format
Returns: the degree of every node
"""
# An earlier version took adj_matrix (a scipy sparse matrix); the function now takes the graph G directly.
def degree_graph(G):
    """Plot the degree distribution of ``G`` on log-log axes and return its degrees.

    Args:
        G: A networkx graph.

    Returns:
        The object returned by ``G.degree()``.

    Side effects:
        Draws the degree-frequency scatter with matplotlib (title includes the
        average degree rounded to 4 decimals), shows it, then closes the
        current figure.
    """
    node_degrees = G.degree()
    # Each item yielded by the degree view is read as (node, degree).
    degree_total = sum(entry[1] for entry in node_degrees)
    mean_degree = degree_total / len(node_degrees)

    # histogram[k] is the number of nodes whose degree equals k.
    histogram = nx.degree_histogram(G)
    node_count = float(sum(histogram))
    degree_axis = range(len(histogram))  # 0 .. maximum degree
    # Turn the per-degree counts into relative frequencies.
    frequencies = [bucket / node_count for bucket in histogram]

    plt.title("The distribution of degree\nAverage degree:" + str(round(mean_degree, 4)))
    plt.loglog(degree_axis, frequencies, "b.")  # blue dots on log-log axes
    plt.show()
    plt.close()
    return node_degrees

"""
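name2index: lookup table from node ID (name) to integer index
Purpose: load one network (edge list file) and register its nodes and edges
Input:
    fileName: path of the edge list file
    start_index: first index to assign to a newly seen node
    name2index1: name -> index map for the node type in column 1
    index2name1: index -> name map for the node type in column 1
    name2index2: name -> index map for the node type in column 2
    index2name2: index -> name map for the node type in column 2
    edges_index_set: set that receives the edges as (index1, index2)
    edges_name_set: set that receives the edges as (name1, name2)
Returns: the next unused index (one past the last index assigned)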
"""
def load_net(fileName, start_index, name2index1, index2name1, name2index2, index2name2, edges_index_set, edges_name_set):
    """Read a whitespace-separated edge list and register its nodes and edges.

    Every line is split on whitespace; the first two fields are the names of
    the edge's endpoints (further fields are ignored). A name that is not yet
    in its name -> index map is assigned the next free index. Lines with fewer
    than two fields (for example blank lines) are skipped instead of raising
    ``IndexError``.

    Args:
        fileName: Path of the edge list file.
        start_index: First index to hand out to a newly seen name.
        name2index1: name -> index map for column 1 (updated in place).
        index2name1: index -> name map for column 1 (updated in place).
        name2index2: name -> index map for column 2 (updated in place).
        index2name2: index -> name map for column 2 (updated in place).
        edges_index_set: Set that receives each edge as ``(index1, index2)``.
        edges_name_set: Set that receives each edge as ``(name1, name2)``.

    Returns:
        The next unused index, i.e. ``start_index`` plus the number of names
        registered by this call. The same value is also printed.
    """
    def _index_of(name, name2index, index2name, next_index):
        # Return (index of name, next free index), registering name if new.
        if name in name2index:
            return name2index[name], next_index
        name2index[name] = next_index
        index2name[next_index] = name
        return next_index, next_index + 1

    with open(fileName, 'r') as f:
        for line in f:
            words = line.split()
            if len(words) < 2:
                # Blank or truncated line: there is no edge to read.
                continue
            name1, name2 = words[0], words[1]
            # Column 1 is resolved first so that, when both columns share the
            # same maps, a name seen in column 1 is reused for column 2.
            index1, start_index = _index_of(name1, name2index1, index2name1, start_index)
            index2, start_index = _index_of(name2, name2index2, index2name2, start_index)
            edges_index_set.add((index1, index2))
            edges_name_set.add((name1, name2))
    print("End index {}".format(start_index))
    return start_index
"""
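name2index: lookup table from node ID (name) to integer index
Purpose: collect the nodes that appear in a network file
Input:
    fileName: path of the edge list file
    name2index1: name -> index map for the node type in column 1
    name2index2: name -> index map for the node type in column 2
Returns: two lists of node indices, one per column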
"""
def getNodelist(fileName, name2index1, name2index2):
    """Collect the distinct node indices that appear in an edge list file.

    Every line is split on whitespace; field 0 is looked up in
    ``name2index1`` and field 1 in ``name2index2``. Lines with fewer than two
    fields (for example blank lines) are skipped instead of raising
    ``IndexError``.

    Args:
        fileName: Path of the edge list file.
        name2index1: name -> index map for column 1.
        name2index2: name -> index map for column 2.

    Returns:
        ``(nodes1, nodes2)``: two lists with the distinct indices found in
        column 1 and column 2. The lists come from sets, so their order is
        not meaningful.

    Raises:
        KeyError: If a name in the file is missing from its map.
    """
    nodes1 = set()
    nodes2 = set()
    with open(fileName, 'r') as f:
        for line in f:
            words = line.split()
            if len(words) < 2:
                # Blank or truncated line: nothing to collect.
                continue
            nodes1.add(name2index1[words[0]])
            nodes2.add(name2index2[words[1]])
    return list(nodes1), list(nodes2)

"""
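Purpose: save positive and negative examples (edge lists)
    Each line is written as: node1 node2 flag
    flag is 1 for a positive example and 0 for a negative example
Input:
    save_path: path of the output file
    pos_edges: collection of positive edges
    neg_edges: collection of negative edges
Returns: nothing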
"""
def save_sampledEdges(save_path,pos_edges,neg_edges):
    """Write labelled edges to ``save_path``, positives first, then negatives.

    Each line has the form ``node1 node2 flag`` where flag is ``1`` for an
    edge from ``pos_edges`` and ``0`` for an edge from ``neg_edges``.

    Args:
        save_path: Path of the file to create or overwrite.
        pos_edges: Iterable of ``(node1, node2)`` pairs written with flag 1.
        neg_edges: Iterable of ``(node1, node2)`` pairs written with flag 0.

    Returns:
        None.
    """
    with open(save_path, 'w') as out:
        for flag, edge_group in (("1", pos_edges), ("0", neg_edges)):
            # Every edge must unpack into exactly two endpoints.
            for src, dst in edge_group:
                out.write(" ".join((str(src), str(dst), flag)) + "\n")

"""
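Purpose: build positive and negative examples from an existing edge collection
Input:
    edges_set: the edges already present in the network, usually the disease-gene edge table
    sample_ratio: number of negatives per positive (negatives = int(sample_ratio * positives))
    save_path: path of the output file
Returns: the positive and negative examples, pos_edges:list, neg_edges:list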
"""            
def extract_edges(edges_set,sample_ratio,save_path):
    """Sample negative edges for a set of positive edges and save both.

    Every edge in ``edges_set`` is a positive example. Negative examples are
    ``(node1, node2)`` pairs, with ``node1`` drawn from the first endpoints and
    ``node2`` from the second endpoints of the positive edges, that are
    neither a positive edge nor already sampled.

    Args:
        edges_set: Iterable of ``(node1, node2)`` positive edges.
        sample_ratio: Negatives per positive; ``int(sample_ratio * positives)``
            negative edges are drawn.
        save_path: File passed to ``save_edges`` for the combined output.

    Returns:
        ``(pos_edges, neg_edges)`` as two lists.

    Raises:
        ValueError: If more negatives are requested than there are candidate
            pairs that are not positive edges (the loop could never finish).
    """
    pos_edges = list(edges_set)
    neg_nums = int(sample_ratio * len(pos_edges))

    # Built in the same insertion order as a loop of set.add calls, so the
    # resulting lists (and therefore the random draws) are unchanged.
    dis_nodes = list({edge[0] for edge in pos_edges})
    gene_nodes = list({edge[1] for edge in pos_edges})

    # Hash-based lookups: a list membership test per draw is quadratic overall.
    pos_lookup = {tuple(edge) for edge in pos_edges}
    available = len(dis_nodes) * len(gene_nodes) - len(pos_lookup)
    if neg_nums > available:
        raise ValueError(
            "requested {} negative edges but only {} candidate pairs exist".format(
                neg_nums, available))

    neg_edges = []
    neg_lookup = set()
    while len(neg_edges) < neg_nums:
        edge = (random.choice(dis_nodes), random.choice(gene_nodes))
        if edge in pos_lookup or edge in neg_lookup:
            continue
        neg_lookup.add(edge)
        neg_edges.append(edge)

    # save_edges is defined in a later notebook cell; it must exist by the
    # time this function is called.
    save_edges(save_path,pos_edges,neg_edges)
    return pos_edges,neg_edges

"""
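Purpose: build positive and negative examples for each ratio in a list
Input:
    edges_set: the edges already present in the network, usually the disease-gene edge table
    sample_ratios: list of negative-to-positive ratios
    save_path_prefix: prefix of the output paths; each file is save_path_prefix + str(ratio) + '.txt'
Returns: nothing (the return statement is commented out); results are only written to files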
"""   
def extractEdgesByRatioList(edges_set,sample_ratios,save_path_prefix):
    """Write one positive/negative edge file per negative-sampling ratio.

    For every ratio a fresh set of ``int(ratio * positives)`` negative edges
    is drawn and saved, together with all positive edges, to
    ``save_path_prefix + str(ratio) + '.txt'``. Negatives are reset for each
    ratio so that a file contains exactly the number of negatives its name
    announces (previously negatives accumulated across ratios).

    Args:
        edges_set: Iterable of ``(node1, node2)`` positive edges.
        sample_ratios: Iterable of negative-to-positive ratios.
        save_path_prefix: Prefix of the output file paths.

    Returns:
        None.

    Raises:
        ValueError: If a ratio requests more negatives than there are
            candidate pairs that are not positive edges.
    """
    pos_edges = list(edges_set)

    # Same insertion order as a loop of set.add calls, so the node lists the
    # random draws index into are unchanged.
    dis_nodes = list({edge[0] for edge in pos_edges})
    gene_nodes = list({edge[1] for edge in pos_edges})

    # Hash-based lookups: a list membership test per draw is quadratic overall.
    pos_lookup = {tuple(edge) for edge in pos_edges}
    available = len(dis_nodes) * len(gene_nodes) - len(pos_lookup)

    for sample_ratio in sample_ratios:
        neg_nums = int(sample_ratio * len(pos_edges))
        if neg_nums > available:
            raise ValueError(
                "ratio {} requests {} negative edges but only {} candidate pairs exist".format(
                    sample_ratio, neg_nums, available))

        neg_edges = []
        neg_lookup = set()
        while len(neg_edges) < neg_nums:
            edge = (random.choice(dis_nodes), random.choice(gene_nodes))
            if edge in pos_lookup or edge in neg_lookup:
                continue
            neg_lookup.add(edge)
            neg_edges.append(edge)

        save_path = save_path_prefix + str(sample_ratio) + '.txt'
        # save_edges is defined in a later notebook cell; it must exist by
        # the time this function is called.
        save_edges(save_path,pos_edges,neg_edges)

In [26]:
"""
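Purpose: save positive and negative examples (edge lists)
    Each line is written as: node1 node2 flag
    flag is 1 for a positive example and 0 for a negative example
Input:
    save_path: path of the output file
    pos_edges: collection of positive edges
    neg_edges: collection of negative edges
Returns: nothing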
"""
def save_edges(save_path,pos_edges,neg_edges):
    """Write labelled edges to ``save_path``, negatives first, then positives.

    Each line has the form ``node1 node2 flag`` where flag is ``0`` for an
    edge from ``neg_edges`` and ``1`` for an edge from ``pos_edges``. Only the
    first two items of every edge are used. The number of lines written is
    printed once all edges have been written.

    Args:
        save_path: Path of the file to create or overwrite.
        pos_edges: Iterable of indexable edges written with flag 1.
        neg_edges: Iterable of indexable edges written with flag 0.

    Returns:
        None.
    """
    written = 0
    with open(save_path, 'w') as out:
        for flag, edge_group in (("0", neg_edges), ("1", pos_edges)):
            for edge in edge_group:
                out.write(" ".join((str(edge[0]), str(edge[1]), flag)) + "\n")
                written += 1
        print(written)
            
"""
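Purpose: sample a given number of negative edges (this dataset has 130820 edges in total)
Input:
    G: the network
    sampleRatio, totalNum: int(sampleRatio * totalNum) negative edges are sampled
Returns: list of negative edges, each stored as (node1, node2)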
"""
def sampleNegativeEdge(G,sampleRatio=0.3,totalNum=130820):
    """Sample node pairs that are not connected in ``G``.

    Candidate pairs are ``(node1, node2)`` with ``node1`` drawn from the first
    endpoints and ``node2`` from the second endpoints of the tuples returned
    by ``G.edges()``. A candidate is kept when ``G.has_edge(node1, node2)`` is
    false and the same tuple has not been sampled before. Asking the graph
    directly matters for undirected graphs: ``G.edges()`` lists each edge in
    one orientation only, so a tuple lookup would accept an existing edge
    drawn in the other orientation as a "negative".

    Args:
        G: Graph providing ``edges()``, ``has_edge(u, v)`` and ``G[u]``
            (neighbours of ``u``), e.g. a networkx graph.
        sampleRatio: Fraction of ``totalNum`` to sample.
        totalNum: Reference edge count; ``int(sampleRatio * totalNum)``
            negative edges are returned.

    Returns:
        List of ``(node1, node2)`` tuples in the order they were accepted.

    Raises:
        ValueError: If more negatives are requested than there are
            unconnected candidate pairs (the loop could never finish).
    """
    # NOTE(review): this seeds NumPy's global generator only. The draws below
    # use the standard-library ``random`` module, whose state is whatever the
    # caller left it in, so this line does not make the sample reproducible.
    np.random.seed(123456)
    sampleNegNum = int(sampleRatio * totalNum)

    all_edges = list(set(G.edges()))
    # Same insertion order as a loop of set.add calls, so the node lists the
    # random draws index into are unchanged.
    nodes1 = list({edge[0] for edge in all_edges})
    nodes2 = list({edge[1] for edge in all_edges})
    # NOTE(review): a node that occurs as both a first and a second endpoint
    # can be paired with itself; such pairs are not filtered out here.

    # Count the candidate pairs that are real edges so an impossible request
    # fails fast instead of looping forever.
    nodes2_lookup = set(nodes2)
    blocked = sum(1 for u in nodes1 for v in G[u] if v in nodes2_lookup)
    available = len(nodes1) * len(nodes2) - blocked
    if sampleNegNum > available:
        raise ValueError(
            "requested {} negative edges but only {} unconnected candidate pairs exist".format(
                sampleNegNum, available))

    samples_negEdges = []
    sampled_lookup = set()  # O(1) duplicate check instead of scanning the list
    while len(samples_negEdges) < sampleNegNum:
        edge = (random.choice(nodes1), random.choice(nodes2))
        if G.has_edge(edge[0], edge[1]) or edge in sampled_lookup:
            continue
        sampled_lookup.add(edge)
        samples_negEdges.append(edge)
    return samples_negEdges
    
def mst_drop(G, drop_ratio=0.3):
    """Drop a share of the edges of ``G`` without disconnecting it.

    A minimum spanning tree (Kruskal) of ``G`` is computed first; only edges
    outside that tree may be dropped, so every pair of nodes that was
    connected stays connected. Tree membership is tested with an
    orientation-independent key: ``G.edges()`` reports an undirected edge in
    one arbitrary orientation, so comparing it with a sorted tuple would fail
    to protect tree edges whose endpoints are listed in descending order.

    Args:
        G: Undirected networkx graph. It is modified in place.
        drop_ratio: Share of the edges to drop. ``int(total * drop_ratio)``
            edges are removed, or every non-tree edge if there are not more
            than ``total * drop_ratio`` of them.

    Returns:
        ``(G, drop_pos)``: the same graph object after removal and an ndarray
        holding the dropped edges, one ``(node1, node2)`` row per edge.
    """
    # Seeds NumPy's global generator, which np.random.choice below draws from.
    np.random.seed(123456)
    from networkx.algorithms import tree

    all_pos_edges = list(G.edges())
    total = len(set(all_pos_edges))

    # Edges of the spanning tree, keyed without regard to endpoint order.
    mst_keys = set(
        frozenset(edge)
        for edge in tree.minimum_spanning_edges(G, algorithm='kruskal', data=False)
    )

    # Droppable edges: every distinct edge that is not part of the tree,
    # kept in the order G.edges() reports them.
    candidates = []
    seen = set()
    for edge in all_pos_edges:
        if edge in seen or frozenset(edge) in mst_keys:
            continue
        seen.add(edge)
        candidates.append(edge)
    other_pos_edges = np.asarray(candidates)
    other = len(candidates)

    if other <= total * drop_ratio:
        # Not enough non-tree edges to reach the ratio: drop all of them.
        drop_pos = other_pos_edges
    else:
        drop_pos_idx = np.random.choice(np.arange(other), int(total * drop_ratio), replace=False)
        drop_pos = other_pos_edges[drop_pos_idx]
    G.remove_edges_from(drop_pos)
    return G, drop_pos

def random_drop(G, drop_ratio=0.3):
    """Drop a uniformly random share of the edges of ``G``.

    Unlike ``mst_drop`` no edge is protected, so the graph may become
    disconnected.

    Args:
        G: Graph providing ``edges()`` and ``remove_edges_from()``. It is
            modified in place.
        drop_ratio: Share of the edges to drop; ``int(total * drop_ratio)``
            edges are removed.

    Returns:
        ``(G, drop_pos)``: the same graph object after removal and the list of
        dropped edges.
    """
    # NOTE(review): this seeds NumPy's global generator only. The selection
    # below uses the standard-library ``random`` module, whose state is
    # whatever the caller left it in.
    np.random.seed(123456)
    all_pos_edges = list(set(G.edges()))
    total = len(all_pos_edges)
    num_drop = int(total * drop_ratio)
    # Sampling positions (not edges) keeps the selection identical to
    # sampling from an explicit list of indices.
    drop_pos = [all_pos_edges[idx] for idx in random.sample(range(total), num_drop)]
    G.remove_edges_from(drop_pos)
    return G, drop_pos

In [27]:
# Lookup tables between node names and integer node indices.
# load_net below fills each *NI dict as name -> index and each *IN dict as
# index -> name.
disIN = {}          # key: disease index, value: disease name
disNI = {}          # key: disease name, value: disease index
geneIN = {}         # key: gene index, value: gene name
geneNI = {}         # key: gene name, value: gene index
sympIN = {}         # key: symptom index, value: symptom name
sympNI = {}         # key: symptom name, value: symptom index
goIN = {}         # key: GO index, value: GO name
goNI = {}         # key: GO name, value: GO index
data_path='original_data/'
dis_gene_file = data_path+'DisGeNet_dis_gene.txt'  # disease-gene network
dis_symp_file = data_path+'HPO&Orphanet_dis_symp.txt'  # disease-phenotype network
gene_go_file = data_path+'homo_gene_GO.txt'  # gene-ontology network
ppi = data_path+'blab_ppi2016.txt'  # PPI network
index = 0  # next unused node index; one index space is shared by all networks
lines = 0

# blab_ppi2016.txt: both columns are genes, so the same maps are used twice.
ppi_edges_index = set()
ppi_edges_name = set()
index=load_net(ppi, index, geneNI, geneIN, geneNI, geneIN, ppi_edges_index, ppi_edges_name)
# The second count is len(geneIN); its label used to read "geneNI" as well.
print("ppi_network:\nindex: {}\nedges_name: {}\nedges_index: {}\ngeneNI: {}\ngeneIN: {}\n".format(
    index, len(ppi_edges_name), len(ppi_edges_index), len(geneNI), len(geneIN)))

# DisGeNet_dis_gene.txt
dis_gene_edges_index = set()
dis_gene_edges_name = set()
index=load_net(dis_gene_file, index, disNI, disIN, geneNI, geneIN, dis_gene_edges_index, dis_gene_edges_name)
print("DisGeNet_dis_gene\nindex: {}\nedges_name: {}\nedges_index: {}\ndisNI: {}\ndisIN: {}\n".format(
    index, len(dis_gene_edges_name),len(dis_gene_edges_index), len(disNI), len(disIN)))

# HPO&Orphanet_dis_symp.txt
# load_net expects (name2index, index2name) for column 2, i.e. sympNI before
# sympIN; passing them the other way round filled both dicts inverted.
dis_symp_edges_index = set()
dis_symp_edges_name = set()
index=load_net(dis_symp_file, index, disNI, disIN, sympNI, sympIN, dis_symp_edges_index, dis_symp_edges_name)
print("HPO&Orphanet_dis_symp\nindex: {}\nedges_name: {}\nedges_index: {}\nsympIN: {}\nsympNI: {}\n".format(
    index, len(dis_symp_edges_name),len(dis_symp_edges_index), len(sympIN), len(sympNI)))

# homo_gene_GO.txt
gene_go_edges_index = set()
gene_go_edges_name = set()
index=load_net(gene_go_file, index, geneNI, geneIN, goNI, goIN, gene_go_edges_index, gene_go_edges_name)
print("homo_gene_GO\nindex: {}\nedges_name: {}\nedges_index: {}\ngoNI: {}\ngoIN: {}\n".format(
    index, len(gene_go_edges_name), len(gene_go_edges_index), len(goNI), len(goIN)))

# Nodes that actually occur in the disease-gene network.
dis_nodes,gene_nodes=getNodelist(dis_gene_file,disNI,geneNI)
print("disease nums:{}\ngene nums:{}".format(len(dis_nodes),len(gene_nodes)))

End index 15964
ppi_network:
index: 15964
edges_name: 213888
edges_index: 213888
geneNI: 15964
geneNI: 15964

End index 30929
DisGeNet_dis_gene
index: 30929
edges_name: 130820
edges_index: 130820
disNI: 13074
disIN: 13074

End index 39425
HPO&Orphanet_dis_symp
index: 39425
edges_name: 99087
edges_index: 99087
sympIN: 6540
sympNI: 6540

End index 57358
homo_gene_GO
index: 57358
edges_name: 218337
edges_index: 218337
goNI: 14204
goIN: 14204

disease nums:13074
gene nums:8947


In [28]:
# Connectivity-preserving split of the disease-gene network.
dis_gene_network = nx.Graph()  # undirected graph
dis_gene_network.add_edges_from(dis_gene_edges_index)
saveSampleConnectivePath = "network_edgelist/connectiveSample_edges0.3.txt"

# Negatives are sampled from the full graph, before any edge is dropped.
neg_edges = sampleNegativeEdge(dis_gene_network, sampleRatio=0.3)

# mst_drop removes edges from dis_gene_network in place and returns that same
# graph object together with the dropped (held-out positive) edges.
dis_gene_network_connective, pos_edges = mst_drop(dis_gene_network, drop_ratio=0.3)
pos_edges = [tuple(pair) for pair in pos_edges]
save_edges(saveSampleConnectivePath, pos_edges, neg_edges)

print(len(dis_gene_network_connective.edges))
print(len(pos_edges))
print(len(neg_edges))

78492
91574
39246
39246


In [29]:
# Random split of the disease-gene network. The graph is rebuilt from the
# full edge set because mst_drop and random_drop remove edges in place.
dis_gene_network = nx.Graph()  # undirected graph
dis_gene_network.add_edges_from(list(dis_gene_edges_index))
saveSampleRandomPath="network_edgelist/randomSample_edges0.3.txt"
dis_gene_network_random, pos_edges_rand = random_drop(dis_gene_network, drop_ratio=0.3)
# neg_edges is reused from the cell that built the connective split.
save_edges(saveSampleRandomPath,pos_edges_rand,neg_edges)
print(len(list(dis_gene_network_random.edges)))
# Report the edges dropped by this cell, not pos_edges from the previous one.
print(len(pos_edges_rand))
print(len(neg_edges))

78492
91574
39246
39246


In [30]:
# Heterogeneous networks in different combinations, built on the
# connectivity-preserving disease-gene graph.
# d: disease, g: gene, s: symptom, p: protein (PPI), o: GO
dg_edges = set(dis_gene_network_connective.edges())
dgs_edges = dg_edges | dis_symp_edges_index
dgp_edges = dg_edges | ppi_edges_index
dgpo_edges = dgp_edges | gene_go_edges_index
dgsp_edges = dgs_edges | ppi_edges_index
all_edges = dgsp_edges | gene_go_edges_index

# One undirected graph per edge combination.
dg_network = nx.Graph()
dg_network.add_edges_from(dg_edges)

dgs_network = nx.Graph()
dgs_network.add_edges_from(dgs_edges)

dgp_network = nx.Graph()
dgp_network.add_edges_from(dgp_edges)

dgpo_network = nx.Graph()
dgpo_network.add_edges_from(dgpo_edges)

dgsp_network = nx.Graph()
dgsp_network.add_edges_from(dgsp_edges)

all_network = nx.Graph()
all_network.add_edges_from(all_edges)

# Persist every network as an adjacency list.
nx.write_adjlist(dg_network, "network_edgelist/dg_network_adjlist_connective.txt")
nx.write_adjlist(dgs_network, "network_edgelist/dgs_network_adjlist_connective.txt")
nx.write_adjlist(dgp_network, "network_edgelist/dgp_network_adjlist_connective.txt")
nx.write_adjlist(dgpo_network, "network_edgelist/dgpo_network_adjlist_connective.txt")
nx.write_adjlist(dgsp_network, "network_edgelist/dgsp_network_adjlist_connective.txt")
nx.write_adjlist(all_network, "network_edgelist/all_network_adjlist_connective.txt")

In [31]:
# Heterogeneous networks in different combinations, built on the randomly
# split disease-gene graph.
# d: disease, g: gene, s: symptom, p: protein (PPI), o: GO
dg_edges=set(dis_gene_network_random.edges())
dgs_edges=dg_edges.union(dis_symp_edges_index)
dgp_edges=dg_edges.union(ppi_edges_index)
dgpo_edges=dgp_edges.union(gene_go_edges_index)
dgsp_edges=dgs_edges.union(ppi_edges_index)
all_edges=dgsp_edges.union(gene_go_edges_index)

dg_network = nx.Graph()
dg_network.add_edges_from(list(dg_edges))
dgs_network = nx.Graph()
dgs_network.add_edges_from(list(dgs_edges))
dgp_network = nx.Graph()
dgp_network.add_edges_from(list(dgp_edges))
dgpo_network = nx.Graph()
dgpo_network.add_edges_from(list(dgpo_edges))
dgsp_network = nx.Graph()
dgsp_network.add_edges_from(list(dgsp_edges))
# Rebuild all_network from this cell's all_edges; without this the file
# written below would contain the graph left over from the connective split.
all_network = nx.Graph()
all_network.add_edges_from(list(all_edges))

nx.write_adjlist(dg_network, "network_edgelist/dg_network_adjlist_random.txt")
nx.write_adjlist(dgs_network, "network_edgelist/dgs_network_adjlist_random.txt")
nx.write_adjlist(dgp_network, "network_edgelist/dgp_network_adjlist_random.txt")
nx.write_adjlist(dgpo_network, "network_edgelist/dgpo_network_adjlist_random.txt")
nx.write_adjlist(dgsp_network, "network_edgelist/dgsp_network_adjlist_random.txt")
nx.write_adjlist(all_network, "network_edgelist/all_network_adjlist_random.txt")