In [11]:
from pathlib import Path

import networkx as nx
import node2vec
from node2vec import Node2Vec

from utils import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
# Load the pickled graph from disk; the trailing tuple is rendered as the cell
# output in the form (node count, edge count).
G = nx.read_gpickle('data/network.gpickle')
(G.number_of_nodes(), G.number_of_edges())

(60789, 75647)

To compute node embeddings I use the node2vec approach. I first tried the network SVD embedding from network-sklearn, but it didn't work well: the resulting vectors had a very small scale (values around 1e-20). node2vec gives embeddings that behave much better under dot-product or cosine similarity. Note that it is not particularly fast: training takes about 3.5 minutes for a graph with 60k vertices and 75k edges, after some hyperparameter tuning.

In [13]:
def generate_node_embeddings(
    graph: nx.Graph,
    dimensions: int = 128,
    walk_length: int = 30,
    num_walks: int = 10,
    workers: int = 4,
    window: int = 5,
    min_count: int = 1,
):
    """Train node2vec embeddings for the nodes of ``graph``.

    The defaults reproduce the values that used to be hard-coded here, so
    ``generate_node_embeddings(G)`` behaves as before; exposing them as
    keyword arguments lets the hyperparameters be tuned from the calling cell.

    Args:
        graph: The graph to embed.
        dimensions: Size of each embedding vector.
        walk_length: Number of nodes visited by each random walk.
        num_walks: Number of walks started per node. The original call did not
            pass this and relied on the library default; 10 matches the ten
            walk batches visible in the training cell's progress output.
        workers: Number of parallel workers passed to ``Node2Vec``.
        window: Context window size forwarded to ``Node2Vec.fit``.
        min_count: Minimum token frequency forwarded to ``Node2Vec.fit``;
            1 keeps every node that appears in a walk.

    Returns:
        The fitted model returned by ``Node2Vec.fit``. Elsewhere in this
        notebook it is used through ``.wv`` (vectors keyed by ``str(node)``)
        and ``.save(path)``.
    """
    walker = Node2Vec(
        graph,
        dimensions=dimensions,
        walk_length=walk_length,
        num_walks=num_walks,
        workers=workers,
    )
    return walker.fit(window=window, min_count=min_count)

In [14]:
%%time
# Train embeddings on the full graph with the default hyperparameters of
# generate_node_embeddings; %%time reports the CPU and wall-clock cost of the cell.
embeddings = generate_node_embeddings(G)

Computing transition probabilities: 100%|██████████| 60789/60789 [00:02<00:00, 21211.24it/s]
Generating walks (CPU: 1): 100%|██████████| 3/3 [00:18<00:00,  6.22s/it]
Generating walks (CPU: 3): 100%|██████████| 2/2 [00:12<00:00,  6.26s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [00:19<00:00,  6.42s/it]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:12<00:00,  6.46s/it]


CPU times: user 14min 3s, sys: 1.81 s, total: 14min 5s
Wall time: 4min 4s


In [15]:
# Peek at the first five edges to see how node ids are represented in the graph.
# zip() stops as soon as range(5) is exhausted, so at most five edges are printed.
for _position, edge in zip(range(5), G.edges()):
    print(edge)

(0, 1)
(0, 24)
(0, 350192)
(1, 2)
(2, 3)


In [16]:
# Sanity check: list the ten nodes most similar to node 0. The model's vocabulary
# is keyed by the string form of the node id, hence '0' rather than 0.
embeddings.wv.most_similar(positive='0', topn=10)

[('1', 0.9941486120223999),
 ('350192', 0.9853284358978271),
 ('24', 0.9732028245925903),
 ('2', 0.971156895160675),
 ('350191', 0.9545277953147888),
 ('200921', 0.9452282786369324),
 ('3', 0.9370497465133667),
 ('200923', 0.9238253831863403),
 ('332388', 0.9122708439826965),
 ('350190', 0.8987720608711243)]

In [17]:
# Make sure the output directory exists before saving, so that a fresh checkout
# does not lose a multi-minute training run to a missing-directory error.
Path('model_data').mkdir(parents=True, exist_ok=True)
embeddings.save('model_data/embeddings.data')

In [18]:
# Join embeddings with their corresponding nodes (stored as the node attribute `x`).
# Writing through the node view only updates attributes of nodes that already exist,
# so the node set cannot change while it is being iterated; add_node(int(node), ...)
# would insert a brand-new node mid-iteration if an id were not already an int.
# The model's vocabulary is keyed by str(node). The vector is copied so the stored
# attribute is an independent array (presumably to avoid aliasing the model's storage).
for node in G.nodes():
    G.nodes[node]['x'] = embeddings.wv[str(node)].copy()

In [19]:
# Persist the graph back to the same path it was loaded from at the top of the
# notebook; this overwrites the input pickle with the current state of G.
nx.write_gpickle(G, 'data/network.gpickle')