In [1]:
from __future__ import division
from __future__ import print_function
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import islice
import pickle
import networkx as nx
import random
import gc
random.seed(32)
np.random.seed(32)

In [2]:
# 读取文件中的正负边
def read_edges(filename):
    """Read labelled node pairs from a whitespace-separated edge file.

    Every non-blank line is expected to contain at least three fields:
    ``node1 node2 label``. A pair whose label is exactly the string ``'1'``
    is collected as a positive edge; any other label makes it a negative
    edge. Pairs are collected in sets, so repeated lines collapse into one
    entry and the order of the returned lists is not guaranteed.

    Blank or whitespace-only lines are skipped instead of raising.

    Args:
        filename: Path of the edge file to read.

    Returns:
        A tuple ``(pos_edges, neg_edges)`` of lists of ``(node1, node2)``
        string tuples.

    Raises:
        IndexError: If a non-blank line has fewer than three fields.
    """
    pos_edges = set()
    neg_edges = set()
    with open(filename, 'r') as f:
        for line in f:
            words = line.split()
            if not words:
                # Tolerate empty / trailing blank lines in the edge file.
                continue
            node1 = words[0]
            node2 = words[1]
            label = words[2]
            if label == '1':
                pos_edges.add((node1, node2))
            else:
                neg_edges.add((node1, node2))
    return list(pos_edges), list(neg_edges)

def read_embeddings(embedding_file):
    """Load node embeddings from a text file with a two-field header line.

    The first line must split into exactly two whitespace-separated fields;
    the second one is returned as the integer dimension. Every following
    non-blank line is split on whitespace: the first token is used as the
    node key and the remaining tokens are stored, unconverted, as a list of
    strings.

    Blank or whitespace-only lines after the header are skipped instead of
    raising.

    Args:
        embedding_file: Path of the embedding file to read.

    Returns:
        A tuple ``(embeddings, dimension)`` where ``embeddings`` maps each
        node key to its list of string tokens and ``dimension`` is the
        second header field converted to ``int``.

    Raises:
        ValueError: If the header does not contain exactly two fields or
            its second field is not an integer.
    """
    embeddings = {}
    with open(embedding_file, 'r') as f:
        first_line = f.readline()
        node_nums, dimension = first_line.split()
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate empty / trailing blank lines in the embedding file.
                continue
            embeddings[fields[0]] = fields[1:]
    return embeddings, int(dimension)

def get_edge_features(embedding_file,edges_filename,feature_file):
    """Build a labelled edge-feature table from node embeddings and save it as CSV.

    Embeddings are loaded with ``read_embeddings`` and edges with
    ``read_edges``. For every edge, the endpoint whose id starts with ``'C'``
    and is 8 characters long (checked on the first endpoint only) is taken as
    the disease node and the other endpoint as the gene node. The feature row
    is the disease embedding followed by the gene embedding.

    Args:
        embedding_file: Path passed to ``read_embeddings``.
        edges_filename: Path passed to ``read_edges``.
        feature_file: Destination CSV path (written without the index).

    Returns:
        The ``pandas.DataFrame`` that was written: columns ``'0'`` ..
        ``str(2 * dimension - 1)`` plus ``'label'`` (1 for positive edges,
        0 for negative edges); positive rows come first.

    Raises:
        KeyError: If an edge endpoint has no entry in the embeddings.
    """
    embeddings, embedding_dimension = read_embeddings(embedding_file)
    pos_edges, neg_edges = read_edges(edges_filename)

    def _first_is_disease(edge):
        # Same test as used for both edge classes: 'C' prefix and length 8.
        head = edge[0]
        return head[0] == 'C' and len(head) == 8

    def _edge_row(edge):
        # Orient the pair as (disease, gene) before concatenating embeddings.
        if _first_is_disease(edge):
            disease_id, gene_id = edge
        else:
            gene_id, disease_id = edge
        return np.hstack((embeddings[disease_id], embeddings[gene_id]))

    # Positive rows first, then negative rows; labels follow the same order.
    pos_features = [_edge_row(edge) for edge in pos_edges]
    neg_features = [_edge_row(edge) for edge in neg_edges]
    label = [1] * len(pos_features) + [0] * len(neg_features)
    train_features = pos_features + neg_features

    # Column names are the stringified positions of the concatenated vector.
    feature_columns = [str(i) for i in range(embedding_dimension * 2)]
    train_data = pd.DataFrame(train_features, columns=feature_columns)
    train_data['label'] = label
    train_data.to_csv(feature_file, index=False)
    return train_data

In [5]:
# Edge features for the connectiveSample 0.9 edge list using the sdne embedding file.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "network_edgelist/connectiveSample_edges0.9.txt",
    "embeddings/all_connectiveSample0.9_sdne128.txt",
    "train_data/all_connectiveSample0.9_sdne128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

57358 128


In [4]:
# Batch extraction of embedding-based edge features: one CSV per
# (embedding method, ratio) combination of the negPosRatio edge lists.
negRatio = [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5]
posEdgeFile = "network_edgelist/connectiveSample_edges0.1.txt"  # not referenced again in this cell
filePredix = "network_edgelist/connectiveSample_negPosRatio"
embeddingMethhods = ["node2vec", "deepwalk", "line", "sdne"]
for method in embeddingMethhods:
    for sampleRatio in negRatio:
        ratio_tag = str(sampleRatio)
        edgePath = filePredix + ratio_tag + ".txt"
        # Every ratio reuses the same 0.1 embedding file of the given method.
        embeddingFile = "embeddings/all_connectiveSample0.1_" + method + "128.txt"
        featureSavePath = (
            "train_data/all_connectiveSample_negPosRatio" + ratio_tag + "_" + method + "128.csv"
        )
        train_data = get_edge_features(
            embedding_file=embeddingFile,
            edges_filename=edgePath,
            feature_file=featureSavePath,
        )

In [4]:
# Batch extraction of embedding-based edge features: one CSV per
# (embedding method, sample ratio) combination, ratios 0.1 .. 0.9.
posSampleRatio = [x / 10 for x in range(1, 10)]
embeddingMethhods = ["node2vec", "deepwalk", "line", "sdne"]
filePredix = "network_edgelist/connectiveSample_edges"
for method in embeddingMethhods:
    for sampleRatio in posSampleRatio:
        ratio_tag = str(sampleRatio)
        # Edge list, embedding file and output CSV all share the same ratio tag.
        connectiveSample_edges = filePredix + ratio_tag + ".txt"
        connectiveSample_embedding = (
            "embeddings/all_connectiveSample" + ratio_tag + "_" + method + "128.txt"
        )
        connectiveSample_feature = (
            "train_data/all_connectiveSample" + ratio_tag + "_" + method + "128.csv"
        )
        train_data = get_edge_features(
            embedding_file=connectiveSample_embedding,
            edges_filename=connectiveSample_edges,
            feature_file=connectiveSample_feature,
        )

57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128
57358 128


FileNotFoundError: [Errno 2] No such file or directory: 'embeddings/all_connectiveSample0.9_sdne128.txt'

In [3]:
# node2vec: edge features for the connectiveSample inputs, then for the
# randomSample inputs. train_data is overwritten by the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "network_edgelist/connectiveSample_edges0.5.txt",
    "embeddings/all_connectiveSample_node2vec128.txt",
    "train_data/all_connectiveSample_node2vec128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

randomSample_edges, randomSample_embedding, randomSample_feature = (
    "network_edgelist/randomSample_edges0.5.txt",
    "embeddings/all_randomSample_node2vec128.txt",
    "train_data/all_randomSample_node2vec128.csv",
)
train_data = get_edge_features(
    embedding_file=randomSample_embedding,
    edges_filename=randomSample_edges,
    feature_file=randomSample_feature,
)

57358 128
57358 128


In [4]:
# deepwalk: edge features for the connectiveSample inputs, then for the
# randomSample inputs. train_data is overwritten by the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "network_edgelist/connectiveSample_edges0.5.txt",
    "embeddings/all_connectiveSample_deepwalk128.txt",
    "train_data/all_connectiveSample_deepwalk128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "network_edgelist/randomSample_edges0.5.txt",
    "embeddings/all_randomSample_deepwalk128.txt",
    "train_data/all_randomSample_deepwalk128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

57358 128
57358 128


In [5]:
# line: edge features for the connectiveSample inputs, then for the
# randomSample inputs. train_data is overwritten by the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "network_edgelist/connectiveSample_edges0.5.txt",
    "embeddings/all_connectiveSample_line128.txt",
    "train_data/all_connectiveSample_line128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "network_edgelist/randomSample_edges0.5.txt",
    "embeddings/all_randomSample_line128.txt",
    "train_data/all_randomSample_line128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

57358 128
57358 128


In [17]:
#hope (disabled: exceeds available memory)
# connectiveSample_edges="network_edgelist/connectiveSample_edges0.3.txt"
# connectiveSample_embedding="embeddings/all_connectiveSample_hope128.txt"
# connectiveSample_feature="train_data/all_connectiveSample_hope128.csv"
# train_data=get_edge_features(connectiveSample_embedding,connectiveSample_edges,connectiveSample_feature)
# edges_filename_random="network_edgelist/randomSample_edges0.3.txt"
# embedding_file_random="embeddings/all_randomSample_hope128.txt"
# feature_file_random="train_data/all_randomSample_hope128.csv"
# train_data=get_edge_features(embedding_file_random,edges_filename_random,feature_file_random)

22021 128
22021 128


In [6]:
# sdne: edge features for the connectiveSample inputs, then for the
# randomSample inputs. train_data is overwritten by the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "network_edgelist/connectiveSample_edges0.5.txt",
    "embeddings/all_connectiveSample_sdne128.txt",
    "train_data/all_connectiveSample_sdne128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "network_edgelist/randomSample_edges0.5.txt",
    "embeddings/all_randomSample_sdne128.txt",
    "train_data/all_randomSample_sdne128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

57358 128
57358 128


In [None]:
# #lap
# connectiveSample_edges="network_edgelist/connectiveSample_edges0.3.txt"
# connectiveSample_embedding="embeddings/all_connectiveSample_lap128.txt"
# connectiveSample_feature="train_data/all_connectiveSample_lap128.csv"
# train_data=get_edge_features(connectiveSample_embedding,connectiveSample_edges,connectiveSample_feature)
# edges_filename_random="network_edgelist/randomSample_edges0.3.txt"
# embedding_file_random="embeddings/all_randomSample_lap128.txt"
# feature_file_random="train_data/all_randomSample_lap128.csv"
# train_data=get_edge_features(embedding_file_random,edges_filename_random,feature_file_random)

In [14]:
# gf: edge features for the connectiveSample inputs, then for the
# randomSample inputs (dgsp_* embedding files). train_data is overwritten by
# the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "network_edgelist/connectiveSample_edges0.5.txt",
    "embeddings/dgsp_connectiveSample_gf128.txt",
    "train_data/dgsp_connectiveSample_gf128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "network_edgelist/randomSample_edges0.5.txt",
    "embeddings/dgsp_randomSample_gf128.txt",
    "train_data/dgsp_randomSample_gf128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

39425 128
39425 128


In [49]:
#----------- The cells below use the Stanford public dataset -----------------------------------------------------------

In [68]:
# node2vec (stanford_data): edge features for the connectiveSample inputs,
# then for the randomSample inputs. train_data is overwritten by the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "stanford_data/connectiveSample_edges0.3.txt",
    "stanford_data/dg_connectiveSample_node2vec128.txt",
    "stanford_data/dg_connectiveSample_node2vec128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "stanford_data/randomSample_edges0.3.txt",
    "stanford_data/dg_randomSample_node2vec128.txt",
    "stanford_data/dg_randomSample_node2vec128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

7813 128
7813 128


In [52]:
# deepwalk (stanford_data): edge features for the connectiveSample inputs,
# then for the randomSample inputs. train_data is overwritten by the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "stanford_data/connectiveSample_edges0.3.txt",
    "stanford_data/dg_connectiveSample_deepwalk128.txt",
    "stanford_data/dg_connectiveSample_deepwalk128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "stanford_data/randomSample_edges0.3.txt",
    "stanford_data/dg_randomSample_deepwalk128.txt",
    "stanford_data/dg_randomSample_deepwalk128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

7813 128
7813 128


In [57]:
# line (stanford_data): edge features for the connectiveSample inputs,
# then for the randomSample inputs. train_data is overwritten by the second call.
# NOTE(review): the output recorded under this cell reports a 64-wide header
# although the file names say 128 - confirm the embedding dimension.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "stanford_data/connectiveSample_edges0.3.txt",
    "stanford_data/dg_connectiveSample_line128.txt",
    "stanford_data/dg_connectiveSample_line128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "stanford_data/randomSample_edges0.3.txt",
    "stanford_data/dg_randomSample_line128.txt",
    "stanford_data/dg_randomSample_line128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

7813 64
7813 64


In [54]:
# hope (stanford_data): edge features for the connectiveSample inputs,
# then for the randomSample inputs. train_data is overwritten by the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "stanford_data/connectiveSample_edges0.3.txt",
    "stanford_data/dg_connectiveSample_hope128.txt",
    "stanford_data/dg_connectiveSample_hope128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "stanford_data/randomSample_edges0.3.txt",
    "stanford_data/dg_randomSample_hope128.txt",
    "stanford_data/dg_randomSample_hope128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

7813 128
7813 128


In [55]:
# sdne (stanford_data): edge features for the connectiveSample inputs,
# then for the randomSample inputs. train_data is overwritten by the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "stanford_data/connectiveSample_edges0.3.txt",
    "stanford_data/dg_connectiveSample_sdne128.txt",
    "stanford_data/dg_connectiveSample_sdne128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "stanford_data/randomSample_edges0.3.txt",
    "stanford_data/dg_randomSample_sdne128.txt",
    "stanford_data/dg_randomSample_sdne128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

7813 128
7813 128


In [56]:
# lap (stanford_data): edge features for the connectiveSample inputs,
# then for the randomSample inputs. train_data is overwritten by the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "stanford_data/connectiveSample_edges0.3.txt",
    "stanford_data/dg_connectiveSample_lap128.txt",
    "stanford_data/dg_connectiveSample_lap128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "stanford_data/randomSample_edges0.3.txt",
    "stanford_data/dg_randomSample_lap128.txt",
    "stanford_data/dg_randomSample_lap128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

7813 128
7813 128


In [50]:
# gf (stanford_data): edge features for the connectiveSample inputs,
# then for the randomSample inputs. train_data is overwritten by the second call.
connectiveSample_edges, connectiveSample_embedding, connectiveSample_feature = (
    "stanford_data/connectiveSample_edges0.3.txt",
    "stanford_data/dg_connectiveSample_gf128.txt",
    "stanford_data/dg_connectiveSample_gf128.csv",
)
train_data = get_edge_features(
    embedding_file=connectiveSample_embedding,
    edges_filename=connectiveSample_edges,
    feature_file=connectiveSample_feature,
)

edges_filename_random, embedding_file_random, feature_file_random = (
    "stanford_data/randomSample_edges0.3.txt",
    "stanford_data/dg_randomSample_gf128.txt",
    "stanford_data/dg_randomSample_gf128.csv",
)
train_data = get_edge_features(
    embedding_file=embedding_file_random,
    edges_filename=edges_filename_random,
    feature_file=feature_file_random,
)

7813 128
7813 128


In [None]:
# Structural features computed from adjacency-list graphs, first for the
# "connective" inputs and then for the "random" inputs.
# NOTE(review): extract_struct_feature is not defined in the visible part of
# this notebook and this cell has no execution count - confirm where it lives.
G_connect = nx.read_adjlist("network_edgelist/all_network_adjlist_connective.txt")
trainEdgeFile_connect, savePath_connect = (
    "network_edgelist/connectiveSample_edges0.5.txt",
    "train_data/struct_connective_traindata.csv",
)
struct_train_data_connect = extract_struct_feature(
    G_connect, trainEdgeFile_connect, savePath_connect
)

G_random = nx.read_adjlist("network_edgelist/all_network_adjlist_random.txt")
trainEdgeFile_random, savePath_random = (
    "network_edgelist/randomSample_edges0.5.txt",
    "train_data/struct_random_traindata.csv",
)
struct_train_data_random = extract_struct_feature(
    G_random, trainEdgeFile_random, savePath_random
)

In [12]:
print("begin")

# Active configuration: train_edges_connective edge list with the
# dg_sampledGraph_connective node2vec embedding file.
edges_filename = "network_edgelist/train_edges_connective.txt"
embedding_file = "embeddings/dg_sampledGraph_connective_adjlist_node2vec128.txt"
feature_file = "train_data/dg_connective_node2vec128_traindata.csv"
train_data = get_edge_features(
    embedding_file=embedding_file,
    edges_filename=edges_filename,
    feature_file=feature_file,
)

# Disabled alternative configurations, kept for reference:
# edges_filename="network_edgelist/train_edges_1.0.txt"
#
# embedding_file="embeddings/all_network_node2vec128.txt"
# feature_file="train_data/allnet_node2vec128_train_data.csv"
#
# embedding_file="embeddings/all_network_deepwalk128.txt"
# feature_file="train_data/allnet_deepwalk128_train_data.csv"
#
# embedding_file="embeddings/all_network_hope128.txt"
# feature_file="train_data/allnet_hope128_train_data.csv"
#
# embedding_file="embeddings/all_network_line128.txt"
# feature_file="train_data/allnet_line128_train_data.csv"
#
# embedding_file="embeddings/all_network_sdne128.txt"
# feature_file="train_data/allnet_sdne128_train_data.csv"

begin
22021 128


In [4]:
# Load the labelled edges.
edges_filename = "network_edgelist/train_edges_1.0.txt"
pos_edges, neg_edges = read_edges(edges_filename)

# Load the node2vec embeddings inline, printing the header fields and the
# number of embedding rows read.
embedding_file = "embeddings/all_network_node2vec128.txt"
embeddings = {}
count = 0  # embedding rows read after the header line
with open(embedding_file, 'r') as f:
    first_line = f.readline()
    node_nums, embedding_dimension = first_line.split()
    print("{} {}".format(node_nums, embedding_dimension))
    for line in f:
        line = line.split()
        node_key, vector = line[0], line[1:]
        embeddings[node_key] = vector
        count += 1

print(count)

# Edge features: concatenate the embeddings of both endpoints of each edge.
# Each edge tuple is unpacked as (disease_id, gene_id) in the order stored.
# Positive edges get label 1, negative edges label 0; positives come first.
pos_features, neg_features, label = [], [], []
for edge_group, feature_bucket, edge_label in (
    (pos_edges, pos_features, 1),
    (neg_edges, neg_features, 0),
):
    for disease_id, gene_id in edge_group:
        feature_bucket.append(
            np.hstack((embeddings[str(disease_id)], embeddings[str(gene_id)]))
        )
        label.append(edge_label)
train_features = pos_features + neg_features

In [29]:
# Edge features: concatenate the embeddings of both endpoints of each edge.
# Each edge tuple is unpacked as (disease_id, gene_id) in the order stored.
# Positive edges get label 1, negative edges label 0; positives come first.
pos_features, neg_features, label = [], [], []
for edge_group, feature_bucket, edge_label in (
    (pos_edges, pos_features, 1),
    (neg_edges, neg_features, 0),
):
    for disease_id, gene_id in edge_group:
        feature_bucket.append(
            np.hstack((embeddings[str(disease_id)], embeddings[str(gene_id)]))
        )
        label.append(edge_label)
train_features = pos_features + neg_features

In [30]:
print("begin")
# Row count and width of the first row, for positive then negative features.
for feature_rows in (pos_features, neg_features):
    print('{}:{}'.format(len(feature_rows), len(feature_rows[0])))
# Total number of labels and the shape of the stacked feature matrix.
print(len(label))
print(np.array(train_features).shape)

begin
130820:256
130820:256
261640
(261640, 256)


In [None]:
print("begin")
# Edge-based features: one 'n2v_<i>' column per element of a feature row.
# The column count is taken from the feature rows themselves instead of
# `representation_size`, which is not defined anywhere in the visible
# notebook and would raise NameError on a fresh kernel.
feature_width = len(train_features[0]) if train_features else 0
feature_columns = ['n2v_' + str(i) for i in range(feature_width)]
train_data = pd.DataFrame(train_features, columns=feature_columns)
train_data['label'] = label
train_data.to_csv("train_data/allnet128_train_data.csv", index=False)
train_data.head()