# Implementation of Node2Vec algorithm

For more information about the algorithm, see the project page: https://snap.stanford.edu/node2vec/


The implementation used in this notebook is the `node2vec` Python package, available on GitHub: https://github.com/eliorc/node2vec

In [72]:
import json
import networkx as nx
import pandas as pd
from node2vec import Node2Vec
from pathlib import Path

ModuleNotFoundError: No module named 'sklearn'

## Embedding

In [63]:
# Load the network stored as GML under preprocessed_data/ into a networkx graph
gml_file = Path("preprocessed_data", "removed_links_network.gml")
graph = nx.read_gml(gml_file)

In [64]:
# Fixed seed so that walk generation and Word2Vec training are repeatable
# across "Restart & Run All". Reproducible results also rely on workers=1.
# NOTE(review): `seed` needs a node2vec release whose constructor accepts it — confirm
# against the installed version.
RANDOM_SEED = 42

# Precompute probabilities and generate walks
# ON WINDOWS ONLY WORKS WITH workers=1
node2vec = Node2Vec(
    graph,
    dimensions=64,
    walk_length=10,
    num_walks=50,
    workers=1,
    seed=RANDOM_SEED,
)

# Embed nodes
# Any keywords acceptable by gensim.Word2Vec can be passed
# `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
# `seed` is passed explicitly so the Word2Vec initialisation is seeded as well
model = node2vec.fit(window=5, min_count=1, batch_words=4, seed=RANDOM_SEED)

Computing transition probabilities: 100%|██████████| 4039/4039 [00:47<00:00, 84.35it/s] 
Generating walks (CPU: 1): 100%|██████████| 50/50 [00:33<00:00,  1.49it/s]


In [74]:
# Build a table with one row per graph node: the row label is the node id and
# the columns are the components of that node's embedding vector.
# Vectors are looked up in the trained model by the string form of the node id.
node_ids = list(graph.nodes)
embedding_df = pd.DataFrame(
    data=[model.wv.get_vector(str(node_id)) for node_id in node_ids],
    index=node_ids,
)

embedding_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.974207,0.193191,0.139558,0.464225,0.036012,-1.139447,-0.544864,0.108720,-0.314548,-0.160553,...,0.138021,-0.284023,-0.198328,-0.611097,-0.201371,0.311776,-0.747307,-0.659602,-0.443513,-0.744545
1,1.014977,-0.192294,1.001090,0.115907,0.514993,-1.114572,-0.133250,0.301099,-0.596461,-0.251924,...,-0.295503,0.451169,0.192969,-0.111938,-0.014938,0.145939,-0.200672,-0.424829,-0.076237,-1.070277
2,0.801208,0.579900,-0.128558,0.296963,-0.076054,-0.577290,-0.611306,0.420491,-0.336979,-0.152004,...,-0.161568,-0.465333,-0.023316,-1.003290,-0.477046,0.297297,-1.270623,-0.749566,-0.555958,-0.883016
3,1.203821,-0.178482,0.195285,-0.016858,0.280334,-1.100920,-0.561796,0.295090,-0.773735,-0.325102,...,0.102434,0.031409,-0.174392,-0.361211,-0.346305,0.293268,-0.849077,-0.102255,-0.159760,-0.510552
4,0.304186,-0.005821,0.712837,0.437324,0.209903,-1.299875,-0.709939,0.378777,-1.028498,0.176931,...,0.603349,-0.266882,-0.064429,-0.501173,0.129333,0.636153,-0.802940,-0.590431,-0.273897,-0.566807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4034,0.260240,0.124923,1.164505,0.244557,0.312498,-0.718508,0.012484,-0.500372,-0.551307,-0.231645,...,-0.184850,-0.235115,0.196289,-0.825899,-0.460020,0.535695,-0.058060,-0.490595,0.862857,0.103056
4035,0.010028,0.001116,0.012551,0.001474,0.014384,0.005472,-0.005278,-0.006421,-0.000050,-0.008958,...,0.006773,-0.009629,0.005331,0.007383,-0.010573,0.004071,0.004592,-0.009265,0.004510,-0.003479
4036,0.589961,0.156393,1.312268,0.251175,0.170557,-0.755079,-0.046529,-0.397800,-0.502467,0.039672,...,-0.308101,-0.171525,0.208537,-1.206862,-0.574123,0.696195,-0.043852,-0.294316,0.649222,-0.021691
4037,0.159855,0.114777,1.180619,0.133373,0.300605,-0.884329,-0.092872,-0.409713,-0.481954,-0.200785,...,-0.045627,0.011234,0.176603,-0.937487,-0.461921,0.671828,-0.014732,-0.540675,0.623361,-0.091754


In [76]:
# Write the embedding table to CSV.
# to_csv does not create missing parent folders, so make sure the output
# directory exists first (otherwise a fresh checkout fails here).
output_file = Path("embedded_data/node2vec.csv")
output_file.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): index=False leaves the node ids out of the file, so rows can only be
# matched back to nodes by their position — confirm this is what downstream readers expect.
embedding_df.to_csv(output_file, index=False)