In [1]:
import sys
sys.path.append('../')

import networkx as nx
from node2vec import Node2Vec

from src.utils import load_embeddings

%load_ext autoreload
%autoreload 2

In [2]:
# Load the pickled NetworkX graph and display its size as (nodes, edges).
# NOTE(review): read_gpickle unpickles the file, so only load trusted data; the
# gpickle helpers were removed in NetworkX 3.0, so this cell needs networkx<3.
G = nx.read_gpickle('data/network.gpickle')
G.number_of_nodes(), G.number_of_edges()

(60789, 75647)

To compute node embeddings I use the node2vec approach. I first tried the network SVD from network-sklearn, but it did not work well: the resulting vectors had a very small scale (values around 1e-20). node2vec gives embeddings that behave much better under dot-product or cosine similarity. Note that it is not particularly fast: training took about 3.5 minutes for a graph with 60k vertices and 75k edges after some hyperparameter tuning (the run recorded below took about 7 minutes of wall time).

In [4]:
def generate_node_embeddings(
    graph: nx.Graph,
    dimensions: int = 64,
    num_walks: int = 25,
    walk_length: int = 25,
    window: int = 5,
    min_count: int = 1,
    workers: int = 4,
    temp_folder: str = 'temp/',
):
    """Train node2vec embeddings for every node of ``graph``.

    Args:
        graph: Graph whose nodes should be embedded.
        dimensions: Size of each embedding vector.
        num_walks: Number of random walks started from each node.
        walk_length: Number of nodes visited by each walk.
        window: Context window passed to the underlying ``fit`` call.
        min_count: Minimum token frequency passed to the underlying ``fit`` call.
        workers: Number of parallel workers used for walk generation.
        temp_folder: Scratch directory handed to ``Node2Vec``; it is created
            if missing. Pass a falsy value to skip it.

    Returns:
        The fitted model returned by ``Node2Vec.fit``; vectors are available
        through its ``wv`` attribute.
    """
    import os

    if temp_folder:
        # Node2Vec expects the scratch directory to exist already; create it so
        # the function also works on a fresh checkout.
        os.makedirs(temp_folder, exist_ok=True)

    model = Node2Vec(
        graph,
        dimensions=dimensions,
        num_walks=num_walks,
        walk_length=walk_length,
        temp_folder=temp_folder,
        workers=workers,
    )
    return model.fit(window=window, min_count=min_count)

In [5]:
%%time
# Train the embeddings on the full graph; this is the slow step of the notebook
# (several minutes of wall time in the recorded run below).
embeddings = generate_node_embeddings(G)

Computing transition probabilities: 100%|██████████| 60789/60789 [00:02<00:00, 21166.04it/s]

Generating walks (CPU: 1):   0%|          | 0/7 [00:00<?, ?it/s]

[A[A
Generating walks (CPU: 1):  29%|██▊       | 2/7 [00:21<00:54, 10.97s/it]

Generating walks (CPU: 1):  43%|████▎     | 3/7 [00:42<01:00, 15.14s/it]
[A

Generating walks (CPU: 1):  57%|█████▋    | 4/7 [01:02<00:49, 16.66s/it]
[A

Generating walks (CPU: 1):  71%|███████▏  | 5/7 [01:25<00:37, 18.94s/it]
[A

Generating walks (CPU: 1):  86%|████████▌ | 6/7 [01:44<00:19, 19.08s/it]
[A

Generating walks (CPU: 3): 100%|██████████| 6/6 [02:07<00:00, 21.29s/it]
Generating walks (CPU: 2): 100%|██████████| 6/6 [02:11<00:00, 21.91s/it]
Generating walks (CPU: 4): 100%|██████████| 6/6 [02:13<00:00, 22.25s/it]
Generating walks (CPU: 1): 100%|██████████| 7/7 [02:15<00:00, 19.41s/it]


CPU times: user 21min 36s, sys: 3.4 s, total: 21min 40s
Wall time: 7min 16s


In [6]:
# Peek at the first five edges to see what the node identifiers look like.
# zip() stops as soon as range(5) is exhausted, so at most five edges are printed.
for edge_rank, edge in zip(range(5), G.edges()):
    print(edge)

(0, 1)
(0, 24)
(0, 350192)
(1, 2)
(2, 3)


In [7]:
# Sanity check: the nearest neighbours of node 0 in embedding space should
# include its graph neighbours. The model's vocabulary keys are strings.
embeddings.wv.most_similar('0')

[('1', 0.9892374873161316),
 ('350192', 0.9839320778846741),
 ('24', 0.9814940094947815),
 ('350191', 0.9578383564949036),
 ('200921', 0.9544402360916138),
 ('2', 0.9531792998313904),
 ('200923', 0.937479555606842),
 ('3', 0.920913815498352),
 ('332388', 0.9163187742233276),
 ('350190', 0.9128226041793823)]

In [8]:
# Persist the fitted model so later notebooks can reuse it without retraining.
# NOTE(review): presumably the models_data/ directory must already exist — confirm.
embeddings.save('models_data/embeddings64.data')

In [9]:
# Join embeddings with their corresponding nodes (stored as node attribute `x`).
# The embedding vocabulary is keyed by the string form of the node id, hence str(node).
# Writing through G.nodes[node] only updates nodes that already exist, so the graph
# is never resized while we iterate over it. The vector is copied so the stored
# attribute does not alias the model's own array.
for node in G.nodes():
    G.nodes[node]['x'] = embeddings.wv[str(node)].copy()

In [10]:
# Save the graph, now carrying an `x` embedding on every node, next to the original.
nx.write_gpickle(G, 'data/network64.gpickle')