In [2]:
import os
import pickle
import sys
sys.path.append('../')

import networkx as nx
from node2vec import Node2Vec

from src.utils import load_embeddings
from src.embedding_gen import propEmbed

%load_ext autoreload
%autoreload 2

In [3]:
# Load the pickled graph and show its size as (nodes, edges).
# Plain pickle is used instead of nx.read_gpickle, which was removed in
# NetworkX 3.0; for an uncompressed file the two are equivalent.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('data/network.gpickle', 'rb') as graph_file:
    G = pickle.load(graph_file)
G.number_of_nodes(), G.number_of_edges()

(60789, 75647)

To compute node embeddings I use the node2vec approach. I also tried the network SVD from network-sklearn, but it did not work well (the resulting vectors were scaled very small, with values around 1e-20). node2vec gives embeddings that are better suited to comparison by dot product or cosine similarity. Note that it is not very fast: the model trains for about 3.5 minutes on a graph with 60k vertices and 75k edges, with some hyperparameter tuning.

In [4]:
def generate_node_embeddings(
    graph: nx.Graph,
    *,
    dimensions: int = 128,
    num_walks: int = 20,
    walk_length: int = 25,
    weight_key: str = 'dist',
    temp_folder: str = 'temp/',
    workers: int = 3,
    window: int = 5,
    min_count: int = 1,
):
    """Train node2vec embeddings for ``graph`` and return the fitted model.

    Args:
        graph: Graph to embed.
        dimensions: Size of each embedding vector.
        num_walks: Number of random walks passed to ``Node2Vec``.
        walk_length: Length of each random walk.
        weight_key: Edge attribute name passed to ``Node2Vec`` as the weight key.
        temp_folder: Scratch directory handed to ``Node2Vec``; created here
            if it does not exist yet. Pass a falsy value to skip creation.
        workers: Number of parallel workers used by ``Node2Vec``.
        window: ``window`` argument forwarded to ``Node2Vec.fit``.
        min_count: ``min_count`` argument forwarded to ``Node2Vec.fit``.

    Returns:
        Whatever ``Node2Vec.fit`` returns; the rest of the notebook uses its
        ``.wv`` lookup (keyed by node id as a string) and its ``.save`` method.
    """
    # Make sure the scratch directory is there so a fresh checkout can run
    # the notebook top to bottom without a manual mkdir.
    if temp_folder:
        os.makedirs(temp_folder, exist_ok=True)

    walker = Node2Vec(
        graph,
        dimensions=dimensions,
        num_walks=num_walks,
        walk_length=walk_length,
        weight_key=weight_key,
        temp_folder=temp_folder,
        workers=workers,
    )
    return walker.fit(window=window, min_count=min_count)

In [5]:
%%time
# Train the node2vec model on the full graph; %%time reports how long the cell took.
embeddings = generate_node_embeddings(G)

Computing transition probabilities: 100%|██████████| 60789/60789 [00:02<00:00, 21275.29it/s]
Generating walks (CPU: 1):   0%|          | 0/7 [00:00<?, ?it/s]
[A

KeyboardInterrupt: 

In [5]:
# Peek at the first five edges of the graph.
for position, edge in enumerate(G.edges()):
    if position < 5:
        print(edge)
    else:
        break

(0, 1)
(0, 24)
(0, 350192)
(1, 2)
(2, 3)


In [6]:
# Sanity check: list the nodes most similar to node 0 in the embedding space.
# The model is queried with string keys ('0'), whereas the graph's edge tuples hold ints.
embeddings.wv.most_similar('0')

[('1', 0.9977036118507385),
 ('24', 0.9961258172988892),
 ('2', 0.9940856695175171),
 ('200921', 0.9933603405952454),
 ('3', 0.9908052086830139),
 ('4', 0.98515385389328),
 ('211643', 0.9828546643257141),
 ('332388', 0.9808356165885925),
 ('350192', 0.9796992540359497),
 ('246773', 0.9690355062484741)]

In [7]:
# Save the trained model to disk so the embeddings can be reused later.
# NOTE(review): presumably the 'models_data/' directory must already exist - confirm.
embeddings.save('models_data/embeddings128.data')

In [8]:
# Attach each node's embedding vector to the graph as its 'x' attribute.
# The model is keyed by the string form of the node id, hence str(node).
# The vectors are collected first and assigned afterwards, so the graph is
# never modified while its node view is being iterated.
node_vectors = {node: embeddings.wv[str(node)].copy() for node in G.nodes()}
nx.set_node_attributes(G, node_vectors, name='x')

In [None]:
# Persist the graph, including its node attributes, as a pickle.
# Plain pickle is used instead of nx.readwrite.write_gpickle, which was
# removed in NetworkX 3.0; HIGHEST_PROTOCOL matches write_gpickle's default.
with open('data/network128.gpickle', 'wb') as graph_file:
    pickle.dump(G, graph_file, pickle.HIGHEST_PROTOCOL)

In [10]:
# Shape of the graph's adjacency matrix (expected to be n_nodes x n_nodes).
nx.adjacency_matrix(G).shape

(60789, 60789)


Generating walks (CPU: 1):  43%|████▎     | 3/7 [08:30<12:00, 180.20s/it]
[A
Generating walks (CPU: 1):  57%|█████▋    | 4/7 [12:48<10:27, 209.19s/it]