In [4]:
import collections
import os
import pickle
from datetime import datetime
from operator import itemgetter

import networkit as nk
import torch
from networkx.algorithms import approximation
import numpy as np
from tqdm import tqdm_notebook as tqdm
from numpy import dot
from numpy.linalg import norm
# NOTE: this second import rebinds `tqdm`, shadowing the `tqdm_notebook` alias above.
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer

from node2vec import Node2Vec
import networkx as nx

In [31]:
# Embedding size: passed to Node2Vec as `dimensions` and recorded in the saved file names.
dims = 128

In [6]:
# Threshold value encoded in the graph's file name ("..._{threshold}_cos_thresh_...");
# presumably a cosine-similarity cut-off expressed as a percentage — TODO confirm.
threshold = 90

# NOTE: unpickling executes whatever the file contains, so only load graphs produced by this project.
graph_pickle_path = fr"networkx_graphs/full_adult_intersect_bert_{threshold}_cos_thresh_nx_weighted.pickle"
with open(graph_pickle_path, mode="rb") as input_file:
    nxnet = pickle.load(input_file)

In [7]:
# Sanity check: number of nodes in the loaded graph.
nxnet.number_of_nodes()

16706

In [8]:
# Confirm the loaded graph reports as weighted (Node2Vec reads edge weights via `weight_key`).
nx.is_weighted(nxnet)

True

In [9]:
# nxnet.nodes()

##### Node2Vec params

- graph: The first positional argument has to be a networkx graph. Node names must be all integers or all strings. On the output model they will always be strings.
- dimensions: Embedding dimensions (default: 128)
- walk_length: Number of nodes in each walk (default: 80)
- num_walks: Number of walks per node (default: 10)
- p: Return hyper parameter (default: 1)
- q: In-out hyper parameter (default: 1)
- weight_key: On weighted graphs, this is the key for the weight attribute (default: 'weight')
- workers: Number of workers for parallel execution (default: 1)
- sampling_strategy: Node-specific sampling strategies; supports setting node-specific 'q', 'p', 'num_walks' and 'walk_length'. Use these keys exactly. If not set, the global values passed at object initialization are used.
- quiet: Boolean controlling the verbosity. (default: False)
- temp_folder: String path pointing to folder to save a shared memory copy of the graph - Supply when working on graphs that are too big to fit in memory during algorithm execution.
- seed: Seed for the random number generator (default: None). Deterministic results can be obtained if seed is set and workers=1.


In [37]:
# Node2Vec hyper-parameters for this run; each is forwarded to the Node2Vec
# constructor and most are also recorded in the output file names.
# Return hyper-parameter `p`.
p_val = 0.5
# In-out hyper-parameter `q`.
q_val = 1.0
# Number of walks per node (`num_walks`).
n_walks = 10
# Number of nodes in each walk (`walk_length`).
walk_len = 20
# Seed for the random number generator.
seed=42
workers=1 # to make consecutive runs reproducible (paired w/ seed)

In [38]:
# node2vec expects `temp_folder` to already exist as a directory, so create it
# up front; otherwise a fresh checkout fails here before any walks are generated.
n2v_temp_folder = './node2vec_tmp2'
os.makedirs(n2v_temp_folder, exist_ok=True)

# Constructing the Node2Vec object computes the transition probabilities and
# generates the random walks (about five minutes in the recorded single-worker run).
vecs = Node2Vec(
    nxnet,
    dimensions=dims,
    num_walks=n_walks,
    walk_length=walk_len,
    p=p_val,
    q=q_val,
    workers=workers,
    seed=seed,
    temp_folder=n2v_temp_folder,
)

Computing transition probabilities:   0%|          | 0/16706 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [05:05<00:00, 30.53s/it]


In [39]:
# Display the Node2Vec object that was just constructed.
vecs

<node2vec.node2vec.Node2Vec at 0x1b77d1f10>

In [40]:
# Embed nodes: fit a gensim Word2Vec model on the generated walks.
# Any keyword accepted by gensim.Word2Vec may be given here; `dimensions` and
# `workers` are forwarded automatically from the Node2Vec constructor.
model = vecs.fit(
    window=10,
    min_count=1,
    batch_words=4,
)

In [23]:

# Look for most similar nodes
# Qualitative spot-check of the embedding: nearest neighbours of a single node.
model.wv.most_similar('horse')  # Output node names are always strings


[('trees', 0.8439834713935852),
 ('claws', 0.8405680060386658),
 ('mammals', 0.8394131660461426),
 ('cattle', 0.8376344442367554),
 ('cat', 0.8335410952568054),
 ('flora', 0.8267657160758972),
 ('hara', 0.8212054371833801),
 ('trucks', 0.8131386041641235),
 ('sandals', 0.8115652799606323),
 ('branches', 0.8082591891288757)]

In [42]:
# Timestamp (UTC, month-day_hour:minute) that makes each run's file names distinct.
# NOTE(review): ':' is not a valid file-name character on Windows and the stamp
# carries no year — consider e.g. "%Y-%m-%d_%H-%M" if portability matters.
file_date_info = datetime.utcnow().strftime("%m-%d_%H:%M")

# Make sure the output directory exists so the saves below cannot fail with
# FileNotFoundError after the expensive walk generation and training steps.
embeddings_dir = "networkx_embeddings"
os.makedirs(embeddings_dir, exist_ok=True)

# Suffix shared by both output files: records every hyper-parameter of this run.
run_tag = (
    f"bert_vocab_{threshold}_p{p_val}_q{q_val}_nwalk{n_walks}"
    f"_wlen{walk_len}_dims{dims}_weighted_seeded_{file_date_info}"
)

# Save embeddings for later use
# TODO: confirm the `.wordvectors` extension is what downstream loaders expect;
# `save_word2vec_format` writes the word2vec format (text unless binary=True).
model.wv.save_word2vec_format(f"{embeddings_dir}/embedding_{run_tag}.wordvectors")

# Save model for later use
model_name = f"{embeddings_dir}/model_{run_tag}.model"
model.save(model_name)

In [43]:
# Display the fitted model object.
model

<gensim.models.word2vec.Word2Vec at 0x1b77d1190>

In [44]:
# Display the path the model was saved under.
model_name

'networkx_embeddings/model_bert_vocab_90_p0.5_q1.0_nwalk10_wlen20_dims128_weighted_seeded_12-14_09:50.model'