In [18]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from pecanpy import node2vec
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
from time import time

https://towardsdatascience.com/run-node2vec-faster-with-less-memory-using-pecanpy-1bdf31f136de

https://towardsdatascience.com/overview-of-deep-learning-on-graph-embeddings-4305c10ad4a4

https://towardsdatascience.com/node2vec-explained-graphically-749e49b7eb6b

https://intel.github.io/scikit-learn-intelex/

In [16]:
# Read the edge list (single-space delimiter) into a networkx graph,
# parsing every node label as an integer.
G = nx.read_edgelist(
    '../Data/coauthorship.edgelist',
    delimiter=' ',
    nodetype=int,
)

In [3]:
# Build the pecanpy SparseOTF node2vec graph (p=0.5, q=1) and read the edge
# list as an unweighted, undirected graph.
# NOTE(review): the "_tab" file name suggests a tab-delimited copy of the edge
# list loaded into G above; confirm both files describe the same graph.
g = node2vec.SparseOTF(p=0.5, q=1, workers=4, verbose=True)
g.read_edg(
    '../Data/coauthorship_tab.edgelist',
    weighted=False,
    directed=False,
)
print("Loaded graph\n")

# Simulate the random walks that are later fed to word2vec as sentences.
# n_ckpts / pb_len presumably only tune progress reporting; verify against
# the installed pecanpy version.
walks = g.simulate_walks(
    num_walks=10,
    walk_length=10,
    n_ckpts=100,
    pb_len=100,
)
print("Generated walks")

Loaded graph
100%|███████████████████████████████████████████████████████████████████| 2178010.0/2178010 [00:45<00:00, 48324.65it/s]
Generated walks


In [12]:
class EpochLogger(CallbackAny2Vec):
    '''Callback that prints the start, end and wall-clock duration of each epoch.

    Pass an instance in the ``callbacks`` list of the training call; the
    ``on_epoch_begin`` / ``on_epoch_end`` hooks are then invoked with the
    model being trained.
    '''

    def __init__(self):
        # Zero-based index of the epoch currently being reported.
        self.epoch = 0
        # Timestamp of the most recent epoch start (refreshed in on_epoch_begin).
        self.t0 = time()

    def on_epoch_begin(self, model):
        '''Announce the epoch and restart the timer.'''
        print("\nEpoch #{} start".format(self.epoch))
        self.t0 = time()

    def on_epoch_end(self, model):
        '''Report the elapsed time for the epoch, then advance the counter.'''
        print("Epoch #{} end".format(self.epoch))
        # Use fixed-point "{:.2f}": a bare "{:.2}" means two *significant*
        # digits, which prints e.g. 45.3 as "4.5e+01" instead of "45.30".
        print("Duration of epoch: {:.2f}".format(time() - self.t0))
        self.epoch += 1

In [13]:
# Train a word2vec model on the walks computed earlier: each walk is used as
# a sentence and each visited node as a token.

n_dim = 100
epoch_logger = EpochLogger()

# NOTE(review): hs=1 is set while `negative` keeps its library default, so
# negative sampling may be active alongside hierarchical softmax; confirm
# this is intended.
model = Word2Vec(
    vector_size=n_dim,
    window=8,
    min_count=0,
    sg=1,
    hs=1,
    workers=4,
)
print("Built model")

model.build_vocab(walks)
print("Built vocabulary")

model.train(
    walks,
    total_examples=model.corpus_count,
    epochs=5,
    callbacks=[epoch_logger],
)
print("Trained model")

Built model
Built vocabulary
Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
Trained model


In [20]:
# Collect the learned vectors into one matrix with a row per node, following
# G's node iteration order. G's nodes are looked up in the word2vec
# vocabulary by their string form.
node_keys = [str(node) for node in G.nodes()]
embedding = np.zeros((len(node_keys), n_dim))

# Each `row` is a view into `embedding`, so the slice assignment fills it in place.
for row, key in zip(embedding, node_keys):
    row[:] = model.wv[key]