In [1]:
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import islice
import pickle
import networkx as nx
import openne
import openne.node2vec
from openne.graph import *
import scipy.sparse as sp
import random
# Seed both the standard-library and the NumPy global random generators
# with the same fixed value.
random.seed(32)
np.random.seed(32)

In [2]:
def loadNet(fileName):
    """Load a network as a set of edges from a whitespace-delimited file.

    The first two whitespace-separated tokens of every line are taken as the
    endpoints of one edge; any further tokens on the line are ignored.
    Blank (or whitespace-only) lines are skipped.

    Args:
        fileName: Path of the edge-list file to read.

    Returns:
        set: ``(node1, node2)`` tuples of strings, one per distinct edge.

    Raises:
        IndexError: If a non-blank line holds only a single token.
    """
    edges = set()
    with open(fileName, 'r') as f:
        for line in f:
            words = line.split()
            if not words:
                # Blank line: indexing words[0] would raise IndexError.
                continue
            edges.add((words[0], words[1]))
    return edges

def getNodelist(fileName):
    """Collect the nodes that appear in each column of an edge-list file.

    The first whitespace-separated token of every line is recorded as a
    first-column node and the second token as a second-column node.
    Blank (or whitespace-only) lines are skipped.

    Args:
        fileName: Path of the edge-list file to read.

    Returns:
        tuple: ``(nodes1, nodes2)`` where ``nodes1`` is the sorted list of
        distinct first-column tokens and ``nodes2`` the sorted list of
        distinct second-column tokens. Sorting makes the order independent
        of set iteration order.

    Raises:
        IndexError: If a non-blank line holds only a single token.
    """
    nodeList1 = set()
    nodeList2 = set()
    with open(fileName, 'r') as f:
        for line in f:
            words = line.split()
            if not words:
                # Blank line: indexing words[0] would raise IndexError.
                continue
            nodeList1.add(words[0])
            nodeList2.add(words[1])
    return sorted(nodeList1), sorted(nodeList2)

def save_sampledEdges(save_path,pos_edges,neg_edges):
    """Write labelled positive and negative edges to a text file.

    Every edge becomes one line of the form ``node1 node2 flag`` where
    ``flag`` is 1 for a positive edge and 0 for a negative edge. All
    positive edges are written first, followed by all negative edges.

    Args:
        save_path: Path of the file to (over)write.
        pos_edges: Iterable of two-element positive edges.
        neg_edges: Iterable of two-element negative edges.

    Returns:
        None
    """
    labelled_groups = ((pos_edges, 1), (neg_edges, 0))
    with open(save_path, 'w') as out:
        for group, flag in labelled_groups:
            for src, dst in group:
                out.write(" ".join((str(src), str(dst), str(flag))) + "\n")

def save_edges(save_path,pos_edges,neg_edges):
    """Write labelled edges to a text file and print how many were written.

    Every edge becomes one line of the form ``node1 node2 flag`` where
    ``flag`` is 1 for a positive edge and 0 for a negative edge. Negative
    edges are written first, followed by the positive edges. Only the first
    two elements of each edge are used. The total number of written lines is
    printed once at the end.

    Args:
        save_path: Path of the file to (over)write.
        pos_edges: Iterable of indexable positive edges.
        neg_edges: Iterable of indexable negative edges.

    Returns:
        None
    """
    written = 0
    with open(save_path, 'w') as out:
        # Negatives (flag 0) go first, positives (flag 1) second.
        for flag, group in ((0, neg_edges), (1, pos_edges)):
            for pair in group:
                written += 1
                out.write(" ".join((str(pair[0]), str(pair[1]), str(flag))) + "\n")
        print(written)
            
def sampleNegativeEdge(G,sampleRatio=0.3,totalNum=130820):
    """Sample node pairs that are NOT edges of ``G`` (negative examples).

    A candidate pair takes its first node from the first endpoints and its
    second node from the second endpoints reported by ``G.edges()``. A
    candidate is rejected when it pairs a node with itself, when
    ``G.has_edge`` reports it as an existing edge (for an undirected graph
    this covers both orientations), or when it was already sampled (for an
    undirected graph ``(u, v)`` and ``(v, u)`` count as the same pair).

    Sampling uses a private generator seeded with a fixed value, so repeated
    calls on the same graph return the same list regardless of the state of
    the global ``random`` module.

    Args:
        G: Graph exposing ``edges()``, ``has_edge(u, v)`` and
            ``is_directed()`` (for example a networkx graph).
        sampleRatio: Fraction of ``totalNum`` to sample.
        totalNum: Reference count the ratio is applied to; the number of
            negatives returned is ``int(sampleRatio * totalNum)``.

    Returns:
        list: Negative edges as ``(node1, node2)`` tuples.

    Raises:
        ValueError: If more negatives are requested than there are candidate
            pairs. This is only an upper-bound check: a request below the
            bound can still be unsatisfiable on a very dense graph, in which
            case the sampling loop does not terminate.
    """
    sampleNegNum = int(sampleRatio * totalNum)

    # Retained so that numpy's global generator is left in the same state as
    # before; the draws below do not use it.
    np.random.seed(123456)
    # The draws use the standard-library generator, so that is the one that
    # has to be seeded. A private instance avoids touching global state.
    rng = random.Random(123456)

    # dict.fromkeys removes duplicates while following the order of
    # G.edges(); going through set() made the order hash-dependent.
    nodes1 = list(dict.fromkeys(edge[0] for edge in G.edges()))
    nodes2 = list(dict.fromkeys(edge[1] for edge in G.edges()))

    if sampleNegNum <= 0:
        return []
    if sampleNegNum > len(nodes1) * len(nodes2):
        raise ValueError(
            "cannot sample %d negative edges from %d candidate pairs"
            % (sampleNegNum, len(nodes1) * len(nodes2))
        )

    directed = G.is_directed()
    seen = set()
    sample_negEdges = []
    while len(sample_negEdges) < sampleNegNum:
        node1 = rng.choice(nodes1)
        node2 = rng.choice(nodes2)
        if node1 == node2 or G.has_edge(node1, node2):
            continue
        # Orientation-insensitive key for undirected graphs so that a pair
        # and its mirror image are not both returned.
        key = (node1, node2) if directed else frozenset((node1, node2))
        if key in seen:
            continue
        seen.add(key)
        sample_negEdges.append((node1, node2))
    return sample_negEdges
    
def mst_drop(G, drop_ratio=0.3):
    """Randomly drop existing edges while leaving a spanning tree untouched.

    The edges of a minimum spanning tree/forest of ``G`` (Kruskal) are never
    candidates for removal, so nodes that were connected before the call are
    still connected afterwards. Up to ``int(total * drop_ratio)`` of the
    remaining edges are removed, where ``total`` is the number of edges of
    ``G``; when fewer non-tree edges than that exist, all of them are
    removed. ``G`` is modified in place.

    Tree edges are matched against ``G.edges()`` irrespective of endpoint
    order, because the two sources may report the same undirected edge with
    its endpoints swapped.

    Args:
        G: Undirected networkx graph; edges are removed from it in place.
        drop_ratio: Fraction of all edges to drop.

    Returns:
        tuple: ``(G, drop_pos)`` where ``G`` is the same (now reduced) graph
        object and ``drop_pos`` is an ndarray holding the dropped edges.
    """
    np.random.seed(123456)
    # Ordered, de-duplicated edge list; going through set() made the
    # candidate order (and therefore the sample) hash-dependent.
    all_pos_edges = list(dict.fromkeys(G.edges()))
    total = len(all_pos_edges)

    # frozenset keys make the comparison independent of endpoint order and
    # do not require node labels to be sortable.
    mst_keys = set(
        frozenset(edge)
        for edge in nx.minimum_spanning_edges(G, algorithm='kruskal', data=False)
    )

    # Every edge that is not part of the spanning tree may be dropped.
    other_pos_edges = [edge for edge in all_pos_edges if frozenset(edge) not in mst_keys]
    other = len(other_pos_edges)

    if other <= total * drop_ratio:
        drop_edges = other_pos_edges
    else:
        drop_pos_idx = np.random.choice(np.arange(other), int(total * drop_ratio), replace=False)
        drop_edges = [other_pos_edges[idx] for idx in drop_pos_idx]

    # Remove using the original node objects rather than numpy-coerced
    # copies, so labels always match the graph's own nodes.
    G.remove_edges_from(drop_edges)
    drop_pos = np.asarray(drop_edges)
    return G, drop_pos

def random_drop(G, drop_ratio=0.3):
    """Randomly drop a fraction of the edges of ``G``.

    ``int(total * drop_ratio)`` edges are chosen uniformly without
    replacement, where ``total`` is the number of edges of ``G``, and are
    removed from ``G`` in place. No attempt is made to keep the graph
    connected.

    Sampling uses a private generator seeded with a fixed value, so repeated
    calls on equal graphs drop the same edges regardless of the state of the
    global ``random`` module.

    Args:
        G: Graph exposing ``edges()`` and ``remove_edges_from()`` (for
            example a networkx graph); modified in place.
        drop_ratio: Fraction of all edges to drop.

    Returns:
        tuple: ``(G, drop_pos)`` where ``G`` is the same (now reduced) graph
        object and ``drop_pos`` is the list of dropped edges.
    """
    # Retained so that numpy's global generator is left in the same state as
    # before; the sampling below does not use it.
    np.random.seed(123456)
    # The sampling uses the standard-library generator, so that is the one
    # that has to be seeded. A private instance avoids touching global state.
    rng = random.Random(123456)

    # Ordered, de-duplicated edge list; going through set() made the order
    # (and therefore the sample) hash-dependent.
    all_pos_edges = list(dict.fromkeys(G.edges()))
    num_drop = int(len(all_pos_edges) * drop_ratio)
    drop_pos = rng.sample(all_pos_edges, num_drop)
    G.remove_edges_from(drop_pos)
    return G, drop_pos

In [28]:
# Read the tab-separated disease-gene association table (first row is the header).
datapath="./stanford_data/DG-AssocMiner_miner-disease-gene.tsv"
data=pd.read_csv(datapath,sep='\t',header=0)
diseaseID=data['# Disease ID']
geneID=data['Gene ID']
# One (disease, gene) edge per table row, with both IDs converted to strings.
edges=[(str(disease), str(gene)) for disease, gene in zip(diseaseID, geneID)]
# Build the undirected disease-gene graph from the edge list.
dis_gene_network = nx.Graph()
dis_gene_network.add_edges_from(edges)

In [29]:
saveSampleConnectivePath="./stanford_data/connectiveSample_edges0.3.txt"
# Negatives are drawn first, while the graph still holds every edge.
neg_edges=sampleNegativeEdge(dis_gene_network,sampleRatio=0.3)
# mst_drop removes edges from the graph it is given and returns that same
# graph together with the dropped (positive) edges.
dis_gene_network_connective, pos_edges = mst_drop(dis_gene_network, drop_ratio=0.3)
# Turn each dropped edge into a plain tuple before saving.
pos_edges=[tuple(dropped) for dropped in pos_edges]
save_edges(saveSampleConnectivePath,pos_edges,neg_edges)
# Remaining edges, dropped positives, sampled negatives.
print(dis_gene_network_connective.number_of_edges())
print(len(pos_edges))
print(len(neg_edges))

45653
14950
6407
39246


In [30]:
# Rebuild the full undirected graph from the edge list, because the previous
# graph object had edges removed in place.
dis_gene_network = nx.Graph()
dis_gene_network.add_edges_from(edges)
print(len(dis_gene_network.nodes))
saveSampleRandomPath="./stanford_data/randomSample_edges0.3.txt"
# random_drop removes edges from the graph it is given and returns that same
# graph together with the dropped (positive) edges.
dis_gene_network_random, pos_edges_rand = random_drop(dis_gene_network, drop_ratio=0.3)
save_edges(saveSampleRandomPath,pos_edges_rand,neg_edges)
print(len(list(dis_gene_network_random.edges)))
# Report the edges dropped by THIS split (pos_edges_rand); pos_edges belongs
# to the earlier connectivity-preserving split.
print(len(pos_edges_rand))
print(len(neg_edges))

7813
45653
14950
6407
39246


In [31]:
# Persist both reduced graphs as adjacency lists, one output file per graph.
for reduced_graph, adjlist_path in (
    (dis_gene_network_connective, "./stanford_data/dis_gene_network_connective.txt"),
    (dis_gene_network_random, "./stanford_data/dis_gene_network_random.txt"),
):
    nx.write_adjlist(reduced_graph, adjlist_path)