In [1]:
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import islice
import pickle
import networkx as nx
# import openne
import network_embedding.classify
import network_embedding.gf
import network_embedding.graph
import network_embedding.grarep
import network_embedding.hope
import network_embedding.lap
import network_embedding.line
import network_embedding.lle
import network_embedding.node2vec
import network_embedding.sdne
import network_embedding.tadw
import network_embedding.walker
from network_embedding.graph import *
# from network_embedding import *
# import openne.node2vec
# from openne.graph import *
import random
random.seed(32)
np.random.seed(32)

In [10]:
#读取文件中的正负边
def read_edges(filename):
    """Read labelled node pairs from a whitespace-separated edge file.

    Every non-blank line must hold at least three fields,
    ``node1 node2 label``.  A pair whose label is the string ``'1'`` is a
    positive edge; any other label makes it a negative edge.  Duplicate
    pairs within the same class are collapsed.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(pos_edges, neg_edges)`` tuple.  Each element is a list of
        ``(node1, node2)`` string tuples in sorted order.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    pos_edges = set()
    neg_edges = set()
    with open(filename, 'r') as f:
        for line in f:
            words = line.split()
            if not words:
                # Blank / whitespace-only lines (typically a trailing newline
                # at the end of the file) carry no edge; indexing into the
                # empty field list would raise IndexError.
                continue
            node1, node2, label = words[0], words[1], words[2]
            bucket = pos_edges if label == '1' else neg_edges
            bucket.add((node1, node2))
    # Iteration order of a set of strings can differ between interpreter
    # runs (hash randomization), which would defeat the random seeds set at
    # the top of the notebook.  Sorting makes the returned order stable.
    return sorted(pos_edges), sorted(neg_edges)


# # 读取文件中的正负边
# def read_edges(filename):
#     pos_edges = set()
#     neg_edges = set()
#     with open(filename, 'r') as f:
#         for line in f:
#             words = line.split()
#             node1 = words[0]
#             node2 = words[1]
#             label = words[2]
#             if label == '1':
#                 pos_edges.add((node1, node2))
#             else:
#                 neg_edges.add((node1, node2))
# #     print("{}\n{}".format(len(nodeList1),len(nodeList2)))
#     return list(pos_edges), list(neg_edges)


# 提取图节点的node2vec表示向量
def extract_node2vec(edgelist_file, node2vec_params, save_path):
    """Fit a node2vec model on an edge-list graph and save its embeddings.

    Args:
        edgelist_file: Path handed to ``Graph.read_edgelist``.
        node2vec_params: Mapping with the keys ``'representation_size'``,
            ``'number_walks'``, ``'walk_length'``, ``'workers'``,
            ``'window_size'``, ``'q'`` and ``'p'``.  Per the original
            author's note, DeepWalk corresponds to ``p = 1, q = 1``.
        save_path: Destination passed to the model's ``save_embeddings``.

    Returns:
        The fitted model's ``vectors`` attribute (presumably a mapping from
        node id to embedding vector; confirm against
        ``network_embedding.node2vec``).

    Raises:
        KeyError: If a required key is missing from ``node2vec_params``.
    """
    graph = Graph()
    graph.read_edgelist(filename=edgelist_file)

    # Translate the notebook's parameter names into the constructor's
    # keyword names.  Values are looked up top to bottom.
    settings = {
        'dim': node2vec_params['representation_size'],   # embedding length
        'num_paths': node2vec_params['number_walks'],    # walks per node
        'path_length': node2vec_params['walk_length'],   # steps per walk
        'workers': node2vec_params['workers'],           # parallel workers
        'window': node2vec_params['window_size'],        # skip-gram context size
        'q': node2vec_params['q'],
        'p': node2vec_params['p'],
    }

    model = network_embedding.node2vec.Node2vec(graph=graph, **settings)
    model.save_embeddings(save_path)
    return model.vectors

# 提取图节点的LINE表示向量


def extract_LINE(edgelist_file, LINE_params, save_path):
    """Fit a LINE model on an edge-list graph and save its embeddings.

    Args:
        edgelist_file: Path handed to ``Graph.read_edgelist``.
        LINE_params: Mapping with the keys ``'representation_size'``,
            ``'order'`` and ``'epochs'``.
        save_path: Destination passed to the model's ``save_embeddings``.

    Returns:
        The fitted model's ``vectors`` attribute.

    Raises:
        KeyError: If a required key is missing from ``LINE_params``.
    """
    graph = Graph()
    graph.read_edgelist(filename=edgelist_file)

    # Map notebook parameter names onto the constructor's keyword names.
    settings = {
        'rep_size': LINE_params['representation_size'],
        'order': LINE_params['order'],
        'epoch': LINE_params['epochs'],
    }

    model = network_embedding.line.LINE(graph=graph, **settings)
    model.save_embeddings(save_path)
    return model.vectors

# 提取图节点的GraRep表示向量


def extract_GraRep(edgelist_file, GraRep_params, save_path):
    """Fit a GraRep model on an edge-list graph and save its embeddings.

    Args:
        edgelist_file: Path handed to ``Graph.read_edgelist``.
        GraRep_params: Mapping with the keys ``'representation_size'`` and
            ``'kstep'``.
        save_path: Destination passed to the model's ``save_embeddings``.

    Returns:
        The fitted model's ``vectors`` attribute.

    Raises:
        KeyError: If a required key is missing from ``GraRep_params``.
    """
    graph = Graph()
    graph.read_edgelist(filename=edgelist_file)

    # Map notebook parameter names onto the constructor's keyword names.
    settings = {
        'dim': GraRep_params['representation_size'],
        'Kstep': GraRep_params['kstep'],
    }

    model = network_embedding.grarep.GraRep(graph=graph, **settings)
    model.save_embeddings(save_path)
    return model.vectors

# 提取图节点的LLE表示向量


def extract_LLE(edgelist_file, LLE_params, save_path):
    """Fit an LLE model on an edge-list graph and save its embeddings.

    Args:
        edgelist_file: Path handed to ``Graph.read_edgelist``.
        LLE_params: Mapping with the key ``'representation_size'``.
        save_path: Destination passed to the model's ``save_embeddings``.

    Returns:
        The fitted model's ``vectors`` attribute.

    Raises:
        KeyError: If ``'representation_size'`` is missing from ``LLE_params``.
    """
    graph = Graph()
    graph.read_edgelist(filename=edgelist_file)

    embedding_dim = LLE_params['representation_size']

    model = network_embedding.lle.LLE(graph=graph, d=embedding_dim)
    model.save_embeddings(save_path)
    return model.vectors

# 提取图节点的HOPE表示向量


def extract_HOPE(edgelist_file, HOPE_params, save_path):
    """Fit a HOPE model on an edge-list graph and save its embeddings.

    Args:
        edgelist_file: Path handed to ``Graph.read_edgelist``.
        HOPE_params: Mapping with the key ``'representation_size'``.
        save_path: Destination passed to the model's ``save_embeddings``.

    Returns:
        The fitted model's ``vectors`` attribute.

    Raises:
        KeyError: If ``'representation_size'`` is missing from ``HOPE_params``.
    """
    graph = Graph()
    graph.read_edgelist(filename=edgelist_file)

    embedding_dim = HOPE_params['representation_size']

    model = network_embedding.hope.HOPE(graph=graph, d=embedding_dim)
    model.save_embeddings(save_path)
    return model.vectors

# 提取图节点的SDNE表示向量


def extract_SDNE(edgelist_file, SDNE_params, save_path):
    """Fit an SDNE model on an edge-list graph and save its embeddings.

    Args:
        edgelist_file: Path handed to ``Graph.read_edgelist``.
        SDNE_params: Mapping with the keys ``'epochs'``, ``'bs'``, ``'lr'``,
            ``'nu1'``, ``'nu2'``, ``'beta'``, ``'alpha'`` and the encoder
            layer sizes under either ``'encoder_list'`` or ``'encoder-list'``
            (the spelling used by the notebook's configuration cell).  The
            layer sizes may be a list/tuple or a string holding a Python
            literal such as ``'[1000, 128]'``.  A ``'representation_size'``
            entry is not required; this function never passed it to the
            model.
        save_path: Destination passed to the model's ``save_embeddings``.

    Returns:
        The fitted model's ``vectors`` attribute.

    Raises:
        KeyError: If a required key is missing from ``SDNE_params``.
        ValueError, SyntaxError: If the encoder list string is not a valid
            Python literal (raised by ``ast.literal_eval``).
    """
    # Imported locally because the notebook's import cell does not import
    # ``ast``; without this the function fails with NameError.
    import ast

    network = Graph()
    network.read_edgelist(filename=edgelist_file)

    # Accept both spellings: callers written against this function use
    # 'encoder_list', while the configuration cell defines 'encoder-list'.
    encoder_key = 'encoder_list' if 'encoder_list' in SDNE_params else 'encoder-list'
    encoder_spec = SDNE_params[encoder_key]
    if isinstance(encoder_spec, (list, tuple)):
        encoder_layer_list = list(encoder_spec)
    else:
        # literal_eval only evaluates Python literals, so a string coming
        # from a config file cannot execute arbitrary code.
        encoder_layer_list = ast.literal_eval(encoder_spec)

    # NOTE(review): the configuration cell's comment suggests the embedding
    # length is carried by the encoder list itself - confirm against
    # network_embedding.sdne.
    epochs = SDNE_params['epochs']
    bs = SDNE_params['bs']
    lr = SDNE_params['lr']
    nu1 = SDNE_params['nu1']
    nu2 = SDNE_params['nu2']
    beta = SDNE_params['beta']
    alpha = SDNE_params['alpha']

    model = network_embedding.sdne.SDNE(graph=network, encoder_layer_list=encoder_layer_list,
                                        alpha=alpha, beta=beta, nu1=nu1, nu2=nu2,
                                        batch_size=bs, epoch=epochs, learning_rate=lr)
    model.save_embeddings(save_path)
    return model.vectors

# 提取图节点的LaplacianEigenmaps表示向量
def extract_LaplacianEigenmaps(edgelist_file, LaplacianEigenmaps_params, save_path):
    """Fit a Laplacian Eigenmaps model on an edge-list graph and save it.

    Args:
        edgelist_file: Path handed to ``Graph.read_edgelist``.
        LaplacianEigenmaps_params: Mapping with the key
            ``'representation_size'``.
        save_path: Destination passed to the model's ``save_embeddings``.

    Returns:
        The fitted model's ``vectors`` attribute.

    Raises:
        KeyError: If ``'representation_size'`` is missing from the params.
    """
    graph = Graph()
    graph.read_edgelist(filename=edgelist_file)

    embedding_dim = LaplacianEigenmaps_params['representation_size']

    model = network_embedding.lap.LaplacianEigenmaps(graph=graph, rep_size=embedding_dim)
    model.save_embeddings(save_path)
    return model.vectors

# 提取图节点的GraphFactorization表示向量
def extract_GraphFactorization(edgelist_file, GraphFactorization_params, save_path):
    """Fit a Graph Factorization model on an edge-list graph and save it.

    Args:
        edgelist_file: Path handed to ``Graph.read_edgelist``.
        GraphFactorization_params: Mapping with the keys
            ``'representation_size'``, ``'epochs'``, ``'weight-decay'``
            (hyphenated) and ``'lr'``.
        save_path: Destination passed to the model's ``save_embeddings``.

    Returns:
        The fitted model's ``vectors`` attribute.

    Raises:
        KeyError: If a required key is missing from the params.
    """
    graph = Graph()
    graph.read_edgelist(filename=edgelist_file)

    # Map notebook parameter names onto the constructor's keyword names.
    settings = {
        'rep_size': GraphFactorization_params['representation_size'],
        'epoch': GraphFactorization_params['epochs'],
        'weight_decay': GraphFactorization_params['weight-decay'],
        'learning_rate': GraphFactorization_params['lr'],
    }

    model = network_embedding.gf.GraphFactorization(graph=graph, **settings)
    model.save_embeddings(save_path)
    return model.vectors

In [None]:
# Hyper-parameters for each embedding method, numbered as in the run cell.

# 1. node2vec
node2vec_params = dict(
    representation_size=128,  # length of each embedding vector
    number_walks=40,          # random walks per node (original note mentions 18)
    walk_length=10,           # length of each random walk (original note mentions 100)
    workers=8,                # number of parallel workers
    window_size=10,           # skip-gram context size (original note mentions 16)
    # node2vec bias parameters; candidate values {0.25, 0.50, 1, 2, 4}
    q=0.25,
    p=0.25,
)

# 2. DeepWalk: same keys as node2vec with p = q = 1
deepwalk_params = dict(
    representation_size=128,
    number_walks=40,
    walk_length=10,
    workers=8,
    window_size=10,
    q=1,
    p=1,
)

# 3. LINE
LINE_params = dict(
    representation_size=128,
    order=3,
    epochs=8,
)

# 4. GraRep
GraRep_params = dict(
    representation_size=128,
    kstep=8,
)

# 5. LLE
LLE_params = dict(representation_size=128)

# 6. HOPE
HOPE_params = dict(representation_size=128)

# 7. SDNE (literal form because 'encoder-list' is not a valid keyword name)
SDNE_params = {
    # encoder layer sizes, stored as a string; the original note labels
    # this entry as the embedding length
    'encoder-list': '[1000, 128]',
    'bs': 200,
    'epochs': 8,
    'lr': 0.01,
    'nu1': 1e-5,
    'nu2': 1e-4,
    'beta': 5.,
    'alpha': 1e-6,
}

# 8. Laplacian Eigenmaps
LaplacianEigenmaps_params = dict(representation_size=128)

# 9. Graph Factorization (literal form because of the hyphenated key)
GraphFactorization_params = {
    'representation_size': 128,
    'epochs': 8,
    'weight-decay': 5e-4,
    'lr': 0.01,
}

# Edge list shared by every embedding run.
input_path = "network_edgelist/all_network_edgelist.txt"

In [12]:
# input_path="network_edgelist/all_network_edgelist.txt"
# node2vec_embeddings_file="embeddings/all_network_node2vec128.txt"
# node2vec_embeddings=extract_node2vec(input_path, node2vec_params, node2vec_embeddings_file)

In [None]:
print("begin")

# Only the LLE run (#5) is active in this cell. The other methods are kept
# below as commented-out invocations, numbered like the parameter cell.

# 5. LLE
LLE_embedding_file = "embeddings/all_network_LLE128.txt"
LLE_embeddings = extract_LLE(
    edgelist_file=input_path,
    LLE_params=LLE_params,
    save_path=LLE_embedding_file,
)

# 2. DeepWalk
# deepwalk_embedding_file = "embeddings/all_network_deepwalk128.txt"
# deepwalk_embeddings = extract_node2vec(input_path, deepwalk_params, deepwalk_embedding_file)

# 3. LINE
# LINE_embedding_file = "embeddings/all_network_LINE128.txt"
# LINE_embeddings = extract_LINE(input_path, LINE_params, LINE_embedding_file)

# 4. GraRep (original note: not enough memory)
# GraRep_embedding_file = "embeddings/all_network_GraRep128.txt"
# GraRep_embeddings = extract_GraRep(input_path, GraRep_params, GraRep_embedding_file)

# 6. HOPE
# HOPE_embedding_file = "embeddings/all_network_HOPE128.txt"
# HOPE_embeddings = extract_HOPE(input_path, HOPE_params, HOPE_embedding_file)

# 7. SDNE
# SDNE_embedding_file = "embeddings/all_network_SDNE128.txt"
# SDNE_embeddings = extract_SDNE(input_path, SDNE_params, SDNE_embedding_file)

# 8. Laplacian Eigenmaps
# LaplacianEigenmaps_embedding_file = "embeddings/all_network_LaplacianEigenmaps128.txt"
# LaplacianEigenmaps_embeddings = extract_LaplacianEigenmaps(input_path, LaplacianEigenmaps_params, LaplacianEigenmaps_embedding_file)

# 9. Graph Factorization
# GraphFactorization_embedding_file = "embeddings/all_network_GraphFactorization128.txt"
# GraphFactorization_embeddings = extract_GraphFactorization(input_path, GraphFactorization_params, GraphFactorization_embedding_file)

begin
  (0, 0)	1.0
  (0, 1)	-0.02857142857142857
  (0, 2)	-0.02857142857142857
  (0, 3)	-0.02857142857142857
  (0, 4)	-0.02857142857142857
  (0, 5)	-0.02857142857142857
  (0, 6)	-0.02857142857142857
  (0, 7)	-0.02857142857142857
  (0, 8)	-0.02857142857142857
  (0, 9)	-0.02857142857142857
  (0, 10)	-0.02857142857142857
  (0, 11)	-0.02857142857142857
  (0, 12)	-0.02857142857142857
  (0, 13)	-0.02857142857142857
  (0, 14)	-0.02857142857142857
  (0, 15)	-0.02857142857142857
  (0, 16)	-0.02857142857142857
  (0, 17)	-0.02857142857142857
  (0, 18)	-0.02857142857142857
  (0, 19)	-0.02857142857142857
  (0, 20)	-0.02857142857142857
  (0, 21)	-0.02857142857142857
  (0, 22)	-0.02857142857142857
  (0, 23)	-0.02857142857142857
  (0, 24)	-0.02857142857142857
  :	:
  (57345, 57345)	1.0
  (57346, 57346)	1.0
  (57346, 57347)	-1.0
  (57347, 57346)	-1.0
  (57347, 57347)	1.0
  (57348, 57348)	1.0
  (57348, 57349)	-1.0
  (57349, 57348)	-1.0
  (57349, 57349)	1.0
  (57350, 57350)	1.0
  (57350, 57351)	-1.0
  (5

In [None]:
# # dis_gene_network.write
# input_path="network_edgelist/all_network_edgelist.txt"
# all_network=Graph()
# all_network.read_adjlist(filename=input_path)
# #网络表征模型
# node2vec_all = openne.node2vec.Node2vec(graph=all_network, path_length=walk_length,
#                                   num_paths=number_walks, dim=representation_size,
#                                   workers=workers, p=p, q=q, window=window_size)
# node2vec_all.save_embeddings("embeddings/all_network_node2vec128.txt")

# node2vec_all=node2vec_all.vectors
# print(type(node2vec_all))
# nodes=np.array(list(node2vec_all.keys()))
# all_network_node2vec_vecs=np.array(list(node2vec_all.values()))
# # for node, vec in disease_network_node2vec.items():
# #     diseases.append(node)
# #     vecs.append(vecs)
# print(nodes.shape)
# print(all_network_node2vec_vecs.shape)

In [15]:
# print("begin")
# # dis_gene_network.write
# input_path="network_edgelist/dg_network_edgelist.txt"
# dg_network=Graph()
# dg_network.read_edgelist(filename=input_path)
# #网络表征模型
# node2vec_dg = openne.node2vec.Node2vec(graph=dg_network, path_length=walk_length,
#                                   num_paths=number_walks, dim=representation_size,
#                                   workers=workers, p=p, q=q, window=window_size)
# node2vec_dg.save_embeddings("embeddings/dg_network_node2vec128.txt")

begin
Preprocess transition probs...
Walk iteration:
1 / 40
2 / 40
3 / 40
4 / 40
5 / 40
6 / 40
7 / 40
8 / 40
9 / 40
10 / 40
11 / 40
12 / 40
13 / 40
14 / 40
15 / 40
16 / 40
17 / 40
18 / 40
19 / 40
20 / 40
21 / 40
22 / 40
23 / 40
24 / 40
25 / 40
26 / 40
27 / 40
28 / 40
29 / 40
30 / 40
31 / 40
32 / 40
33 / 40
34 / 40
35 / 40
36 / 40
37 / 40
38 / 40
39 / 40
40 / 40
Learning representation...


In [6]:
# edges_filename="network_edgelist/train_edges_func.txt"
# pos_edges,neg_edges=read_edges(edges_filename)

In [7]:
# #边的特征
# #提取正例特征
# pos_features=[]
# label=[]
# for disease_id,gene_id in pos_edges:
#     pos_features.append(np.hstack((node2vec_all[str(disease_id)],node2vec_all[str(gene_id)])))
#     label.append(1)

# #提取负例特征
# neg_features=[]
# for disease_id,gene_id in neg_edges:
#     neg_features.append(np.hstack((node2vec_all[str(disease_id)],node2vec_all[str(gene_id)])))
#     label.append(0)
# train_features=pos_features+neg_features

In [11]:
# print("begin")
# print('{}:{}'.format(len(pos_features),len(pos_features[0])))
# print('{}:{}'.format(len(neg_features),len(neg_features[0])))
# print(len(label))
# print(np.array(train_features).shape)

begin
130820:256
156984:256
287804
(287804, 256)


In [13]:
# print("begin")
# #基于边的特征
# feature_columns=['n2v_'+str(i) for i in range(representation_size*2)]
# train_data=pd.DataFrame(train_features,columns=feature_columns)
# train_data['label']=label
# train_data.to_csv("train_data/allnet128_train_data.csv",index = False)
# train_data.head()

begin


Unnamed: 0,n2v_0,n2v_1,n2v_2,n2v_3,n2v_4,n2v_5,n2v_6,n2v_7,n2v_8,n2v_9,...,n2v_247,n2v_248,n2v_249,n2v_250,n2v_251,n2v_252,n2v_253,n2v_254,n2v_255,label
0,-0.003202,-0.140782,0.155315,0.064413,0.045403,0.118346,-0.082164,0.009738,0.251382,-0.216391,...,-0.111878,-0.026656,-0.069604,0.257554,0.373072,-0.187672,0.245287,0.043335,0.266498,1
1,-0.15818,-0.074107,0.3851,-0.023721,-0.035587,0.071516,-0.077766,0.027596,0.282734,-0.159644,...,-0.150741,0.159942,-0.075783,0.080754,0.058067,0.001418,-0.050465,0.223759,0.233387,1
2,-0.162158,-0.102947,0.204488,0.027205,0.012591,0.049558,-0.16542,0.060486,0.302477,-0.346868,...,-0.122516,0.09653,-0.06612,-0.03669,0.181183,-0.149756,-0.092061,0.234985,0.311093,1
3,-0.074494,-0.068874,0.172986,-0.024724,0.171737,0.097275,-0.150305,0.180905,0.078419,-0.20296,...,0.009005,0.020131,-0.045608,0.035811,0.128828,-0.037315,-0.177486,0.178492,0.066715,1
4,-0.172494,-0.1654,0.17508,-0.01054,0.000172,-0.03047,-0.157295,0.086059,0.301881,-0.29142,...,-0.05276,0.127906,0.037284,0.099079,0.230483,-0.084454,-0.217913,0.381856,0.31146,1
