In [1]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from node2vec import Node2Vec

In [2]:
# ------------------ number of citations for labeled data -------------- #
Dataset = "pubmed"
citation_edges = []
citation_path = "../Data/"+Dataset+"/labeled_subgraph.txt"
# read in paper ID
with open(citation_path, 'r', encoding = 'utf8') as infile:
    for line in infile:
        citation_edges.append((line.strip("\n").split("\t")[0], line.strip("\n").split("\t")[1]))
print(len(citation_edges))
print(citation_edges[:5])

3319946
[('23942007', '9106097'), ('23942007', '10.1001/jama.296.6.652-b'), ('23942007', '21862737'), ('23942007', '10.1002/hep.22206'), ('23942007', '20158695')]


In [3]:
# --------- construct graph ------------- #
import networkx as nx
citation_G=nx.Graph()
citation_G.add_edges_from(citation_edges)

In [4]:
# ---------- collect graph statistic ----- #
print(nx.info(citation_G))
node_count = citation_G.number_of_nodes()
edge_count = citation_G.number_of_edges()
citation_degrees = citation_G.degree()
sum_of_edges = sum(citation_degrees.values())
average_degrees = sum_of_edges/len(citation_G)
print(sum_of_edges)
print(average_degrees)

Name: 
Type: Graph
Number of nodes: 2011361
Number of edges: 3309148
Average degree:   3.2905
6618296
3.2904565615023857


In [5]:
print("Number of nodes: ", node_count)
print("Number of edges: ", edge_count)

Number of nodes:  2011361
Number of edges:  3309148


In [6]:
print(citation_G.nodes()[:5])
print(citation_G.edges()[:5])
print(citation_G.neighbors("23942007"))

['23942007', '9106097', '10.1001/jama.296.6.652-b', '21862737', '10.1002/hep.22206']
[('23942007', '9106097'), ('23942007', '10.1001/jama.296.6.652-b'), ('23942007', '21862737'), ('23942007', '10.1002/hep.22206'), ('23942007', '20158695')]
['9106097', '10.1001/jama.296.6.652-b', '21862737', '10.1002/hep.22206', '20158695', '10.1097/00007890-200012270-00024', '10.1001/jama.286.2.139', '16336437', '21421589', '12517233']


In [7]:
# Precompute probabilities and generate walks - **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(citation_G, dimensions=100, walk_length=20, num_walks=10, workers=1)

Computing transition probabilities: 100%|██████████| 2011361/2011361 [12:30<00:00, 2681.79it/s] 
Generating walks (CPU: 1): 100%|██████████| 10/10 [9:08:05<00:00, 3545.67s/it] 


In [8]:
print(dir(node2vec))

['FIRST_TRAVEL_KEY', 'NEIGHBORS_KEY', 'NUM_WALKS_KEY', 'PROBABILITIES_KEY', 'P_KEY', 'Q_KEY', 'WALK_LENGTH_KEY', 'WEIGHT_KEY', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_generate_walks', '_precompute_probabilities', 'd_graph', 'dimensions', 'fit', 'graph', 'num_walks', 'p', 'q', 'quiet', 'require', 'sampling_strategy', 'temp_folder', 'walk_length', 'walks', 'weight_key', 'workers']


In [9]:
# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)

In [10]:
print(dir(model))

['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_check_training_sanity', '_clear_post_train', '_do_train_job', '_get_job_params', '_get_thread_working_mem', '_job_producer', '_load_specials', '_log_epoch_end', '_log_epoch_progress', '_log_progress', '_log_train_end', '_minimize_model', '_raw_word_count', '_save_specials', '_set_train_params', '_smart_save', '_train_epoch', '_update_job_params', '_worker_loop', 'accuracy', 'alpha', 'batch_words', 'build_vocab', 'build_vocab_from_freq', 'callbacks', 'cbow_mean', 'clear_sims', 'compute_loss', 'corpus_count', 'cum_table', 'delete_temporary_training_data', 'doesnt_match', 'epochs', 'estimate_memory

In [12]:
print(model.vector_size)

100


In [25]:
import os
# ----- generate file name ------------ #
newfileDir = "../Data/"+Dataset+"/models/node2vec/citation_sample=140k/citation_embedding/"
if not os.path.exists(newfileDir):
    os.makedirs(newfileDir)
modelname = model.__str__()+"(node2vec)"
EMBEDDING_FILENAME = newfileDir+modelname+".txt"
EMBEDDING_MODEL_FILENAME = newfileDir+"model/"+modelname

# Save embeddings for later use
model.wv.save_word2vec_format(EMBEDDING_FILENAME)

# Save model for later use
model.save(EMBEDDING_MODEL_FILENAME)

FileNotFoundError: [Errno 2] No such file or directory: '../Data/pubmed/models/node2vec/citation_sample=140k/citation_embedding/model/Word2Vec(vocab=2011361, size=100, alpha=0.025)(node2vec).wv.vectors.npy'