# Train node2vec

In [2]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import os
import collections
import pandas as pd
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from node2vec import Node2Vec

In [5]:
# ------------------ number of citations for labeled data -------------- #
Dataset = "pubmed"
citation_edges = []
citation_path = "Data/"+Dataset+"/labeled_subgraph.txt"
# read in paper ID
with open(citation_path, 'r', encoding = 'utf8') as infile:
    for line in infile:
        citation_edges.append((line.strip("\n").split("\t")[0], line.strip("\n").split("\t")[1]))
print(len(citation_edges))
Unique_citation_edges = list(set(citation_edges))
print(len(Unique_citation_edges))
print(Unique_citation_edges[:20])

3319946
[]
3309530
[('18474589', '16829228'), ('25946105', '19258313'), ('21540366', '10.1006/jesp.1996.1318'), ('23147393', '21683756'), ('25348356', '11499453'), ('21366717', '16728156'), ('24741610', '8366922'), ('23251660', '15995707'), ('25110880', '7529365'), ('26780420', '24571091'), ('22385782', '3857016'), ('15716151', '10456101'), ('21626185', '7484596'), ('26261867', '17660825'), ('24662017', '18625687'), ('16161571', '10.1039/b304896a'), ('26296990', '22932219'), ('21569568', '10.1002/jmv.21445'), ('26399585', '21663954'), ('25182062', '18619499')]


In [3]:
print(type(Unique_citation_edges[0]))
print(Unique_citation_edges[0])
for item in Unique_citation_edges:
    print(item[0])
    print(item[1])
    break

<class 'tuple'>
('19072180', '16541065')
19072180
16541065


In [10]:
# ---------------- number of labeled paper in source/target pid ---------------- #
fileDir = "Data/pubmed/canopies_labeled/"
listfiles = os.listdir(fileDir)
LabeledRecords_pid = []

for file in listfiles:
    if not file.startswith('.'):
        with open(fileDir+file, 'r', encoding = 'utf8') as f:
            for line in f:
                read_data = line.split("\t")
                if(len(read_data)==13 or len(read_data)==12):
                    LabeledRecords_pid.append(read_data[0])
                else:
                    print(len(read_data))
        f.close()
print("Total record: ", len(LabeledRecords_pid))
labeled_unique_paperIDs = set(LabeledRecords_pid)
print("Total paper: ", len(labeled_unique_paperIDs))

source_list = [item for item in Unique_citation_edges if item[0] in labeled_unique_paperIDs]
target_list = [item for item in Unique_citation_edges if item[1] in labeled_unique_paperIDs]

print(source_list[:10])
print(len(source_list))
print(target_list[:10])
print(len(target_list))
unique_nodes = set([node for edge in Unique_citation_edges for node in edge])
print(len(unique_nodes))


Total record:  140266
Total paper:  135796
[('19072180', '16541065'), ('15700297', '9665690'), ('21802375', '17938237'), ('27514453', '11498582'), ('16242163', '9517450'), ('17052299', '12690092'), ('23941581', '19320160'), ('26395192', '25303559'), ('18334252', '10.1155/2007/234768'), ('23286790', '22089642')]
3309530
[('15700297', '9665690'), ('26395192', '25303559'), ('26069318', '18462767'), ('17710541', '15016595'), ('26973508', '21520351'), ('25697830', '22823871'), ('18460048', '12105369'), ('24780111', '19027483'), ('23933525', '22896687'), ('19328219', '18191318')]
181532
2011361


In [21]:
# --------- construct graph ------------- #
import networkx as nx
print(len(Unique_citation_edges))
citation_G=nx.Graph()

repreated_edge = []

for edge in Unique_citation_edges:
    if citation_G.has_edge(*edge):
        repreated_edge.append(edge)
    citation_G.add_edge(*edge)
print(repreated_edge[:10])
print(len(repreated_edge))

3309530
[('26450731', '26450730'), ('26543879', '26253204'), ('18083576', '18339401'), ('12225449', '12225448'), ('22064531', '20525378'), ('23517659', '23517658'), ('26429028', '26429029'), ('24058884', '24039896'), ('25318652', '24809667'), ('21473745', '21473746')]
382


In [22]:
# ---------- collect graph statistic ----- #
print("Is directed graph: ",nx.is_directed(citation_G))
print(nx.info(citation_G))
node_count = citation_G.number_of_nodes()
edge_count = citation_G.number_of_edges()
# return sum of degree (Sum of indegree and outdegree)
citation_degrees = citation_G.degree()
sum_of_edges = sum(citation_degrees.values())
average_degrees = sum_of_edges/len(citation_G)
print(type(citation_degrees))
print(sum_of_edges)
print(average_degrees)
# find max degree paper
max_degree = max(citation_degrees.values())
max_degree_item = [node for node in citation_degrees if citation_degrees[node]==max_degree]
print(max_degree)
print(max_degree_item)

Is directed graph:  False
Name: 
Type: Graph
Number of nodes: 2011361
Number of edges: 3309148
Average degree:   3.2905
<class 'dict'>
6618296
3.2904565615023857
2052
['26799652']


In [None]:
# nx.draw(citation_G)
# plt.show()

In [None]:
print(citation_G.nodes()[:5])
print(citation_G.edges()[:5])
print(citation_G.neighbors("23942007"))

['21843670', '16309738', '27424009', '18400456', '24415859']
[('21843670', '16309738'), ('21843670', '10.1016/j.bej.2007.11.016'), ('21843670', '10.1016/j.lwt.2008.03.015'), ('21843670', '10.1016/j.jfoodeng.2008.04.003'), ('21843670', '11209901')]
['12517233', '10.1001/jama.286.2.139', '16336437', '10.1001/jama.296.6.652-b', '9106097', '21862737', '10.1002/hep.22206', '20158695', '10.1097/00007890-200012270-00024', '21421589']


In [None]:
# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(citation_G, dimensions=100, walk_length=60, num_walks=10, workers=1)

Computing transition probabilities: 100%|██████████| 2011361/2011361 [14:48<00:00, 2262.76it/s]
Generating walks (CPU: 1):  90%|█████████ | 9/10 [27:25:01<3:37:30, 13050.58s/it] 


In [None]:
print(dir(node2vec))

['FIRST_TRAVEL_KEY', 'NEIGHBORS_KEY', 'NUM_WALKS_KEY', 'PROBABILITIES_KEY', 'P_KEY', 'Q_KEY', 'WALK_LENGTH_KEY', 'WEIGHT_KEY', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_generate_walks', '_precompute_probabilities', 'd_graph', 'dimensions', 'fit', 'graph', 'num_walks', 'p', 'q', 'quiet', 'require', 'sampling_strategy', 'temp_folder', 'walk_length', 'walks', 'weight_key', 'workers']


In [None]:
# Embed nodes
model = node2vec.fit(window=10, min_count=1)

In [None]:
print(dir(model))

['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_check_input_data_sanity', '_check_training_sanity', '_clear_post_train', '_do_train_epoch', '_do_train_job', '_get_job_params', '_get_thread_working_mem', '_job_producer', '_load_specials', '_log_epoch_end', '_log_epoch_progress', '_log_progress', '_log_train_end', '_minimize_model', '_raw_word_count', '_save_specials', '_set_train_params', '_smart_save', '_train_epoch', '_train_epoch_corpusfile', '_update_job_params', '_worker_loop', '_worker_loop_corpusfile', 'accuracy', 'alpha', 'batch_words', 'build_vocab', 'build_vocab_from_freq', 'callbacks', 'cbow_mean', 'clear_sims', 'compute_loss', 'cor

In [26]:
print(dir(model.wv))

['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_load_specials', '_log_evaluate_word_analogies', '_save_specials', '_smart_save', 'accuracy', 'add', 'closer_than', 'cosine_similarities', 'distance', 'distances', 'doesnt_match', 'evaluate_word_analogies', 'evaluate_word_pairs', 'get_keras_embedding', 'get_vector', 'index2entity', 'index2word', 'init_sims', 'load', 'load_word2vec_format', 'log_accuracy', 'log_evaluate_word_pairs', 'most_similar', 'most_similar_cosmul', 'most_similar_to_given', 'n_similarity', 'rank', 'relative_cosine_similarity', 'save', 'save_word2vec_format', 'similar_by_vector', 'similar_by_word', 'similarity',

In [30]:
print(len(model.wv.vocab))
print(dict(list(model.wv.vocab.items())[:10]))

2011361
{'7935409': <gensim.models.keyedvectors.Vocab object at 0x7f17e51f3f98>, '16793553': <gensim.models.keyedvectors.Vocab object at 0x7f17e4d3ef60>, '7584145': <gensim.models.keyedvectors.Vocab object at 0x7f2e2b728048>, '10.1126/science.291.5507.1155d': <gensim.models.keyedvectors.Vocab object at 0x7f2e2b728a90>, '11891060': <gensim.models.keyedvectors.Vocab object at 0x7f2e2b728320>, '8338689': <gensim.models.keyedvectors.Vocab object at 0x7f2e2b7280f0>, '10570990': <gensim.models.keyedvectors.Vocab object at 0x7f2e2b732ba8>, '9367984': <gensim.models.keyedvectors.Vocab object at 0x7f2e2b732b70>, '17996480': <gensim.models.keyedvectors.Vocab object at 0x7f1a0526ee80>, '10036187': <gensim.models.keyedvectors.Vocab object at 0x7f17e4958a58>}


In [32]:
print(dir(model.save))

['__call__', '__class__', '__delattr__', '__dir__', '__doc__', '__eq__', '__format__', '__func__', '__ge__', '__get__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__self__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__']


In [33]:
# -----------------   save model ------------------ #
import os
# ----- generate file name ------------ #
newfileDir = "models/"+Dataset+"/node2vec/"
if not os.path.exists(newfileDir):
    os.makedirs(newfileDir)
modelname = model.__str__()+"(node2vec)"
EMBEDDING_FILENAME = newfileDir+modelname
EMBEDDING_MODEL_FILENAME = newfileDir+modelname

# save embedding
model.wv.save_word2vec_format(EMBEDDING_FILENAME)
# Save model for later use
model.save(EMBEDDING_MODEL_FILENAME)

In [16]:
# ------------- save vector trained as txt --------------- #
import os
newfileDir = "vectors/"+Dataset+"/node2vec/"
if not os.path.exists(newfileDir):
    os.makedirs(newfileDir)
modelname = model.__str__()+"(node2vec)"
EMBEDDING_FILENAME = newfileDir+modelname+".txt"

# Save embeddings for later use
model.wv.save_word2vec_format(EMBEDDING_FILENAME)

In [34]:
print("Done")

Done
