In [68]:
import networkx as nx
import numpy as np
from tqdm import tqdm
from gensim.models import word2vec
from node2vec import Node2Vec
from sklearn.metrics import roc_auc_score

In [34]:
def walkOneTime(g, start_node, walk_length):
    walk = [str(start_node)]  # 初始化游走序列
    for _ in range(walk_length):  # 最大长度范围内进行采样
        current_node = int(walk[-1])
        if g.has_node(current_node):
            successors = list(g.successors(current_node)) # graph.successor: 获取当前节点的后继邻居
            if len(successors) > 0:
                next_node = np.random.choice(successors, 1)
                walk.extend([str(n) for n in next_node])
    return walk

def getDeepwalkSeqs(g, walk_length, num_walks):
    seqs=[]
    for _ in tqdm(range(num_walks)):
        start_node = np.random.choice(list(g.node))
        w = walkOneTime(g,start_node, walk_length)
        seqs.append(w)
    return seqs

def deepwalk(g, dimensions, walk_length, num_walks, min_count):
    seqs = getDeepwalkSeqs(g, walk_length = walk_length, num_walks = num_walks)
    model = word2vec.Word2Vec(seqs, vector_size = dimensions, min_count = min_count)
    return model

In [84]:
g = nx.fast_gnp_random_graph(n = 500, p = 0.5, directed=True) #快速随机生成一个有向图

In [85]:
model_dw = deepwalk(g, dimensions = 10, walk_length = 20, num_walks = 1000, min_count = 3)

node2vec = Node2Vec(g, dimensions=10, walk_length=20, num_walks=100, p=0.3,q=0.7,workers=4)
model_n2v = node2vec.fit()

100%|██████████████████████████████████████| 1000/1000 [00:01<00:00, 738.70it/s]


Computing transition probabilities:   0%|          | 0/500 [00:00<?, ?it/s]

Generating walks (CPU: 4): 100%|██████████| 25/25 [00:15<00:00,  1.60it/s]

In [86]:
print(model_dw.wv.most_similar('2',topn=3)) # 观察与节点2最相近的三个节点

[('213', 0.997041642665863), ('264', 0.996917724609375), ('42', 0.9967489838600159)]


In [87]:
print(model_n2v.wv.most_similar('2',topn=3))

[('269', 0.9497026205062866), ('128', 0.9482324123382568), ('198', 0.9079732894897461)]








In [36]:
node = []
edge_train = []
edge_test = []
edge_exit = []
count = 0
with open('facebook_combined.txt', 'r') as f:
    data = f.readlines()
    for line in data:
        line = tuple(line.replace('\n', '').split(' '))
        edge_exit.append(line)
        
        if line[0] not in node:
            node.append(line[0])
        
        if line[1] not in node:
            node.append(line[1])
            
        if(count%5 == 0):
            edge_test.append(line)
            count+=1
        else:
            edge_train.append(line)
            count+=1

In [37]:
G = nx.DiGraph()
G.add_nodes_from(node)
G.add_edges_from(edge_train)

In [38]:
G_all = nx.DiGraph()
G_all.add_nodes_from(node)
G_all.add_edges_from(edge_exit)

In [39]:
edge_not_exit = list(nx.non_edges(G_all))

print(len(edge_train))
print(len(edge_test))
print(len(edge_not_exit))

70587
17647
16221248


In [64]:
model = deepwalk(G, dimensions = 128, walk_length = 80, num_walks = 10000, min_count = 1)

100%|███████████████████████████████████| 10000/10000 [00:09<00:00, 1041.63it/s]


In [65]:
print(model.wv.most_similar('0', topn=5))

[('1408', 0.2991068363189697), ('462', 0.2932160496711731), ('3427', 0.28582262992858887), ('3541', 0.2841106653213501), ('2483', 0.2684030830860138)]


In [50]:
len_not = len(edge_not_exit)
len_test = len(edge_test)
y_true = np.array([])
y_scores = np.array([])

for i in range(len_test):
    y_true = np.append(y_true, 1)
    sim = model.wv.similarity(edge_test[i][0], edge_test[i][1])
    y_scores = np.append(y_scores, sim)

In [51]:
j=0
while(j < len_not):
    y_true = np.append(y_true, 0)
    sim = model.wv.similarity(edge_not_exit[i][0], edge_not_exit[i][1])
    y_scores = np.append(y_scores, sim)
    j += 500

In [54]:
print(roc_auc_score(y_true, y_scores))

0.34419448064826885
