In [11]:
import sys
sys.path.append('../')

import networkx as nx
from node2vec import Node2Vec

from src.utils import load_embeddings

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Load the pickled graph, then display its size as a (nodes, edges) pair.
G = nx.read_gpickle('data/network.gpickle')
(len(G), G.number_of_edges())

(60789, 75647)

To compute node embeddings I use the node2vec approach. I first tried network SVD from network-sklearn, but it did not work well (the resulting vectors had a very small scale, with values around 1e-20). node2vec gives embeddings that behave much better under dot product or cosine similarity. However, it is not very fast: the model trains for about 3.5 minutes on a graph with 60k vertices and 75k edges, after some hyperparameter tuning.

In [4]:
def generate_node_embeddings(
    graph: nx.Graph,
    dimensions: int = 128,
    num_walks: int = 25,
    walk_length: int = 25,
    window: int = 5,
    min_count: int = 1,
    workers: int = 3,
    temp_folder: str = 'temp/',
):
    """Train node2vec embeddings for every node of ``graph``.

    The defaults reproduce the configuration originally hard-coded in this
    notebook, so ``generate_node_embeddings(G)`` behaves as before.

    Args:
        graph: Graph whose nodes should be embedded.
        dimensions: Size of each embedding vector.
        num_walks: Number of random walks started from each node.
        walk_length: Number of nodes in each random walk.
        window: Context window passed to ``Node2Vec.fit``.
        min_count: Minimum token frequency passed to ``Node2Vec.fit``.
        workers: Number of parallel workers used by node2vec.
        temp_folder: Directory handed to node2vec for its temporary data.
            Pass ``None`` (or an empty string) to skip creating it.

    Returns:
        The fitted model returned by ``Node2Vec.fit``; the rest of the
        notebook uses its ``.wv`` lookup and ``.save`` method.
    """
    import os

    # node2vec is expected to reject a temp_folder that does not exist, so
    # create it up front to keep a fresh "Restart & Run All" from failing.
    if temp_folder:
        os.makedirs(temp_folder, exist_ok=True)

    model = Node2Vec(
        graph,
        dimensions=dimensions,
        num_walks=num_walks,
        walk_length=walk_length,
        temp_folder=temp_folder,
        workers=workers,
    )
    return model.fit(window=window, min_count=min_count)

In [5]:
%%time
# Train node2vec on the full graph; this is the slow step, hence the %%time above.
embeddings = generate_node_embeddings(G)

Computing transition probabilities:   0%|          | 0/60789 [00:00<?, ?it/s]


Generating walks (CPU: 2):  25%|██▌       | 2/8 [00:13<00:41,  6.85s/it]
Generating walks (CPU: 2):  38%|███▊      | 3/8 [00:28<00:49, 10.00s/it]
Generating walks (CPU: 2):  50%|█████     | 4/8 [00:43<00:48, 12.07s/it]
[A
Generating walks (CPU: 2):  62%|██████▎   | 5/8 [00:59<00:40, 13.46s/it]
Generating walks (CPU: 2):  75%|███████▌  | 6/8 [01:14<00:28, 14.04s/it]
Generating walks (CPU: 2):  88%|████████▊ | 7/8 [01:29<00:14, 14.36s/it]
Generating walks (CPU: 2): 100%|██████████| 8/8 [01:47<00:00, 15.20s/it]
Generating walks (CPU: 2): 100%|██████████| 8/8 [02:02<00:00, 15.33s/it]
Generating walks (CPU: 3): 100%|██████████| 8/8 [02:06<00:00, 15.76s/it]
Generating walks (CPU: 1): 100%|██████████| 9/9 [02:08<00:00, 14.26s/it]


CPU times: user 25min 32s, sys: 3.13 s, total: 25min 35s
Wall time: 10min 3s


In [6]:
# Peek at the first five edges to see what the node ids look like.
for edge in list(G.edges())[:5]:
    print(edge)

(0, 1)
(0, 24)
(0, 350192)
(1, 2)
(2, 3)


In [7]:
# Sanity check: list the nodes closest to node 0 in embedding space.
# The model is queried with the node id as a string ('0').
embeddings.wv.most_similar('0')

[('1', 0.9945969581604004),
 ('350192', 0.98829585313797),
 ('24', 0.9796374440193176),
 ('2', 0.9795376658439636),
 ('200921', 0.9630741477012634),
 ('350191', 0.9622015953063965),
 ('3', 0.9482586979866028),
 ('200923', 0.9421999454498291),
 ('332388', 0.9389787316322327),
 ('211643', 0.935128390789032)]

In [8]:
# Persist the trained model so the training step does not have to be repeated.
embeddings.save('models_data/embeddings128.data')

In [9]:
# Join embeddings with their corresponding nodes (stored as the `x` attribute).
# The model is indexed by the stringified node id, hence str(node).
# Writing through G.nodes[node] only updates nodes that already exist, so the
# graph cannot gain new nodes while it is being iterated.
for node in G.nodes():
    G.nodes[node]['x'] = embeddings.wv[str(node)].copy()

In [10]:
# Save the graph, now carrying an `x` embedding on every node, under a new file name.
nx.write_gpickle(G, 'data/network128.gpickle')