In [1]:
import os
import pickle

import networkx as nx
import node2vec
from node2vec import Node2Vec

from utils import *

%load_ext autoreload
%autoreload 2

In [2]:
# Load the pickled graph. `nx.read_gpickle` was removed in NetworkX 3.0, so the
# file is read with the standard `pickle` module, which works on any version.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('data/network.gpickle', 'rb') as graph_file:
    G = pickle.load(graph_file)
G.number_of_nodes(), G.number_of_edges()

(60789, 75647)

To compute node embeddings I use the node2vec approach. I first tried network SVD from network-sklearn, but it didn't work well (the resulting vectors were scaled very low, with values around 1e-20). node2vec gives embeddings that behave much better under dot product or cosine similarity. Note that it is not very fast: with some hyperparameter tuning, the model trains in about 3.5 minutes on a graph with 60k vertices and 75k edges.

In [7]:
def generate_node_embeddings(
    graph: nx.Graph,
    dimensions: int = 128,
    num_walks: int = 30,
    walk_length: int = 30,
    workers: int = 2,
    temp_folder: str = 'temp/',
    window: int = 5,
    min_count: int = 1,
    batch_words: int = 5,
):
    """Train node2vec embeddings for every node of ``graph``.

    The defaults reproduce the hyperparameters originally hard-coded in this
    notebook, so ``generate_node_embeddings(G)`` behaves as before.

    Args:
        graph: Graph whose nodes should be embedded.
        dimensions: Passed to ``Node2Vec`` as ``dimensions``.
        num_walks: Passed to ``Node2Vec`` as ``num_walks``.
        walk_length: Passed to ``Node2Vec`` as ``walk_length``.
        workers: Passed to ``Node2Vec`` as ``workers``.
        temp_folder: Passed to ``Node2Vec`` as ``temp_folder``. The directory
            is created if it is missing; pass ``None`` to skip it entirely.
        window: Forwarded to ``Node2Vec.fit`` as ``window``.
        min_count: Forwarded to ``Node2Vec.fit`` as ``min_count``.
        batch_words: Forwarded to ``Node2Vec.fit`` as ``batch_words``.

    Returns:
        The fitted model returned by ``Node2Vec.fit``. The rest of the
        notebook reads vectors from its ``.wv`` attribute and persists it
        with ``.save``.
    """
    # node2vec expects the temp folder to exist already; create it up front so
    # a fresh checkout does not fail before the (long) training even starts.
    if temp_folder:
        os.makedirs(temp_folder, exist_ok=True)

    walker = Node2Vec(
        graph,
        dimensions=dimensions,
        num_walks=num_walks,
        walk_length=walk_length,
        temp_folder=temp_folder,
        workers=workers,
    )
    return walker.fit(window=window, min_count=min_count, batch_words=batch_words)

In [8]:
%%time
# Train node2vec on the loaded graph; the %%time cell magic above reports the
# CPU and wall-clock time spent in this cell.
embeddings = generate_node_embeddings(G)

Computing transition probabilities:   0%|          | 0/60789 [00:00<?, ?it/s]

Generating walks (CPU: 2):   0%|          | 0/15 [00:00<?, ?it/s]
Generating walks (CPU: 2):  13%|█▎        | 2/15 [00:13<01:25,  6.59s/it]
Generating walks (CPU: 2):  20%|██        | 3/15 [00:26<01:52,  9.40s/it][A
Generating walks (CPU: 2):  27%|██▋       | 4/15 [00:39<01:57, 10.64s/it][A
Generating walks (CPU: 2):  33%|███▎      | 5/15 [00:52<01:55, 11.58s/it][A
Generating walks (CPU: 2):  40%|████      | 6/15 [01:05<01:48, 12.06s/it][A
Generating walks (CPU: 2):  47%|████▋     | 7/15 [01:18<01:39, 12.43s/it][A
Generating walks (CPU: 2):  53%|█████▎    | 8/15 [01:31<01:27, 12.48s/it][A
Generating walks (CPU: 2):  60%|██████    | 9/15 [01:44<01:16, 12.82s/it][A
Generating walks (CPU: 2):  67%|██████▋   | 10/15 [01:57<01:04, 12.86s/it][A
Generating walks (CPU: 2):  73%|███████▎  | 11/15 [02:11<00:51, 12.95s/it][A
Generating walks (CPU: 2):  80%|████████  | 12/15 [02:26<00:40, 13.62s/it][A
Generating walks (CPU: 2):  87%|████████▋ | 13/15 [02:39<00:27, 13.52s/it][A
Generating

CPU times: user 46min 51s, sys: 2min 21s, total: 49min 13s
Wall time: 24min 59s


In [9]:
# Peek at the first five edges of the graph.
for _, edge in zip(range(5), G.edges()):
    print(edge)

(0, 1)
(0, 24)
(0, 350192)
(1, 2)
(2, 3)


In [10]:
# Sanity check: list the nodes most similar to node 0 in embedding space.
# The model's vocabulary is keyed by the string form of the node id, hence '0'.
embeddings.wv.most_similar('0')

[('1', 0.9895632266998291),
 ('350192', 0.9811784029006958),
 ('24', 0.9749113321304321),
 ('2', 0.9552937746047974),
 ('350191', 0.9526708722114563),
 ('200921', 0.9499428272247314),
 ('200923', 0.9229863882064819),
 ('332388', 0.9122338891029358),
 ('3', 0.9114905595779419),
 ('350190', 0.9056087732315063)]

In [11]:
# Persist the fitted embedding model to disk.
embeddings.save('model_data/embeddings.data')

In [12]:
# Join embeddings with their corresponding nodes (stored as node attribute `x`).
# The model is keyed by the string form of the node id, so look vectors up via
# str(node); each vector is copied so the graph does not alias model storage.
node_vectors = {node: embeddings.wv[str(node)].copy() for node in G.nodes()}
# set_node_attributes only updates existing nodes, so the graph is never
# structurally modified while its node view is being iterated.
nx.set_node_attributes(G, node_vectors, name='x')

In [13]:
# Write the graph (now carrying the `x` embeddings) back to disk.
# `write_gpickle` was removed in NetworkX 3.0; plain `pickle` with the highest
# protocol is the equivalent and works on any NetworkX version.
with open('data/network.gpickle', 'wb') as graph_file:
    pickle.dump(G, graph_file, protocol=pickle.HIGHEST_PROTOCOL)