In [5]:
# ! pip -q install cogdl
# ! pip -q install torch-geometric
# ! pip -q install git+https://github.com/palash1992/GEM.git

In [3]:
# ! export PATH=/home/semantic/harrando/graphner/snap/snap/examples/Release:$PATH

In [1]:
! echo $PATH

/home/semantic/harrando/graphner/snap/snap/examples/Release:/home/semantic/.conda/envs/ismail-transformers/bin:/opt/anaconda3/condabin:/home/semantic/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/home/semantic/.npm-new-global/bin:/usr/local/go/bin:/usr/local/cuda/bin:/usr/local/maven/bin:/home/semantic/go/bin:/usr/local/anaconda3/bin


In [2]:
# ! pip install  keras==2.0.2 # --upgrade

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3"

In [2]:
import pickle
vocabulary = pickle.load(open('edges/vocabulary.pickle', 'rb'))
word2id = {w:i for i,w in enumerate(vocabulary)}

In [3]:
len(word2id)

17377

In [4]:
from sklearn.metrics.pairwise import cosine_similarity as cos

import gem
from gem.utils import graph_util, plot_util
from gem.evaluation import visualize_embedding as viz
from gem.evaluation import evaluate_graph_reconstruction as gr
from time import time

from gem.embedding.gf       import GraphFactorization
from gem.embedding.hope     import HOPE
from gem.embedding.lap      import LaplacianEigenmaps
from gem.embedding.lle      import LocallyLinearEmbedding
from gem.embedding.node2vec import node2vec
from gem.embedding.sdne     import SDNE

In [6]:
%%time
edges_file = 'edges/all_edges.edgelist'
G = graph_util.loadGraphFromEdgeListTxt(edges_file, directed=True)
G = G.to_directed()

CPU times: user 3.78 s, sys: 177 ms, total: 3.96 s
Wall time: 3.95 s


In [7]:
len(G.edges), len(G.nodes)

(433498, 17377)

In [64]:
models = []
output_embeddings = {}
embeddings_dim = 300

# models.append(GraphFactorization(d=2, max_iter=100000, eta=1*10**-4, regu=1.0, data_set='karate'))
models.append(HOPE(d=embeddings_dim, beta=0.01))
# Training time: 99.340407


# LE takes embedding dimension (d) as input
models.append(LaplacianEigenmaps(d=embeddings_dim))
# Training time: 10.422501


# LLE takes embedding dimension (d) as input
models.append(LocallyLinearEmbedding(d=embeddings_dim))
# Training time: 167.512155


# node2vec takes embedding dimension (d),  maximum iterations (max_iter), random walk length (walk_len), number of random walks (num_walks), context size (con_size), return weight (ret_p), inout weight (inout_p) as inputs
models.append(node2vec(d=embeddings_dim, max_iter=5, walk_len=20, num_walks=10, con_size=2, ret_p=1, inout_p=1))

# SDNE takes embedding dimension (d), seen edge reconstruction weight (beta), first order proximity weight (alpha), lasso regularization coefficient (nu1), ridge regreesion coefficient (nu2), number of hidden layers (K), size of each layer (n_units), number of iterations (n_ite), learning rate (xeta), size of batch (n_batch), location of modelfile and weightfile save (modelfile and weightfile) as inputs
# models.append(SDNE(d=embeddings_dim, beta=5, alpha=1e-5, nu1=1e-6, nu2=1e-6, K=3, n_units=[50, 15,], n_iter=50, xeta=0.01, n_batch=500,))
#                modelfile=['enc_model.json', 'dec_model.json'],
#                weightfile=['enc_weights.hdf5', 'dec_weights.hdf5']))

In [65]:
for embedding in models:
    print ('Num nodes: %d, num edges: %d' % (G.number_of_nodes(), G.number_of_edges()))
    t1 = time()
    # Learn embedding - accepts a networkx graph or file with edge list
    Y, t = embedding.learn_embedding(graph=G, edge_f=None, is_weighted=True, no_python=True)
    print (embedding._method_name+':\n\tTraining time: %f' % (time() - t1))
    # Evaluate on graph reconstruction
    # MAP, prec_curv, err, err_baseline = gr.evaluateStaticGraphReconstruction(G, embedding, Y, None)
    output_embeddings[embedding._method_name] = (embedding, Y, t)
    #---------------------------------------------------------------------------------
    # print(("\tMAP: {} \t precision curve: {}\n\n\n\n"+'-'*100).format(MAP,prec_curv[:5]))
    #---------------------------------------------------------------------------------
    # Visualize
    # viz.plot_embedding2D(embedding.get_embedding(), di_graph=G, node_colors=None)
    # plt.show()

Num nodes: 17377, num edges: 433498
SVD error (low rank): 4.367611
hope_gsvd:
	Training time: 140.965614
Num nodes: 17377, num edges: 433498
Laplacian matrix recon. error (low rank): 132.831463
lap_eigmap_svd:
	Training time: 59.257024
Num nodes: 17377, num edges: 433498
lle_svd:
	Training time: 253.570139
Num nodes: 17377, num edges: 433498
node2vec_rw:
	Training time: 47.061241


In [19]:
for embedding_name in output_embeddings:
    # embedding, Y, t = output_embeddings[embedding_name]
    # MAP, prec_curv, err, err_baseline = gr.evaluateStaticGraphReconstruction(G, embedding, Y, None)
    # print(("\tMAP: {} \t precision curve: {}\n\n\n\n"+'-'*100).format(MAP,prec_curv[:5]))
    print(embedding_name)

hope_gsvd
lap_eigmap_svd
lle_svd
node2vec_rw


In [71]:
! ls

20M_classifier_all_available_features.ipynb   nodes_classifier.ipynb
conceptnet_en.csv			      onehot_pytorch_lightning.ipynb
data					      prepare_dataset.ipynb
edge_list_generation.ipynb		      README.md
edges					      runs
graph_embeddings_generation.ipynb	      snap
GraphNER_binary_representation_pytorch.ipynb  tempGraph.emb
hparams.yaml				      tempGraph.graph
lightning_logs


In [None]:
print("""
hope_gsvd
MAP: 0.0036296006863930073 	 precision curve: [0.0, 0.0, 0.0, 0.0, 0.0]



----------------------------------------------------------------------------------------------------
lap_eigmap_svd
MAP: 0.00243098873025156 	 precision curve: [0.0, 0.0, 0.0, 0.0, 0.0]



----------------------------------------------------------------------------------------------------
lle_svd
MAP: 0.002571021389252031 	 precision curve: [0.0, 0.0, 0.0, 0.0, 0.0]



----------------------------------------------------------------------------------------------------
node2vec_rw
MAP: 0.15163359174500898 	 precision curve: [1.0, 0.5, 0.3333333333333333, 0.25, 0.4]



----------------------------------------------------------------------------------------------------
""")

In [66]:
for embedding_name in output_embeddings:
    embedding, Y, t = output_embeddings[embedding_name]
    print(embedding_name, '=======================')
    for w1, w2 in [('paris', 'london'), ('paris', 'love'), ('paris', 'france'), ('love', 'hate'), ('ismail', '<span>'), ('ismail', '<given_name>')]:
        print(w1,'-', w2, cos([Y[word2id[w1]]], [Y[word2id[w2]]]))

paris - london [[0.24793893]]
paris - love [[0.29792616]]
paris - france [[-0.05031744]]
love - hate [[0.1618395]]
ismail - <span> [[0.04653347]]
ismail - <given_name> [[-0.02472678]]
paris - london [[-0.06605461]]
paris - love [[-0.02564922]]
paris - france [[-0.05200741]]
love - hate [[0.07855971]]
ismail - <span> [[-0.15597746]]
ismail - <given_name> [[-0.21047482]]
paris - london [[0.06273263]]
paris - love [[0.10852554]]
paris - france [[-0.06761545]]
love - hate [[-0.04141492]]
ismail - <span> [[-0.02893604]]
ismail - <given_name> [[-0.00720291]]
paris - london [[0.4962066]]
paris - love [[0.22527805]]
paris - france [[0.11837857]]
love - hate [[0.49896708]]
ismail - <span> [[0.39875194]]
ismail - <given_name> [[0.53926276]]


In [70]:
for embedding_name in output_embeddings:
    embedding, Y, t = output_embeddings[embedding_name]
    embeddings_dict = {w: Y[i] for i, w in enumerate(vocabulary)}
    pickle.dump(embeddings_dict, open('edges/'+embedding_name+'_all_embeddings.pickle', 'wb'))

In [68]:
embeddings_dict['<span>']

array([ 1.15255e-01,  8.77112e-02,  1.47464e-01,  2.46221e-01,
        1.09573e-01,  3.67702e-02,  1.38165e-01, -2.53024e-02,
       -2.54002e-01, -1.54566e-01,  2.79029e-01, -3.98017e-01,
        1.80914e-02, -2.81355e-02, -3.79365e-01, -1.74800e-01,
        2.86791e-01, -2.32325e-01, -2.32964e-01,  2.56432e-02,
        3.27784e-01, -2.37017e-01,  1.79768e-01,  6.11590e-02,
       -1.41298e-01,  3.00204e-02,  8.41616e-02, -3.22209e-02,
       -3.18940e-01,  5.03982e-01, -3.18677e-02,  2.87777e-02,
       -2.32932e-01, -8.80168e-01, -3.59775e-01,  3.80506e-02,
        1.55849e-01,  1.15023e-01, -6.20517e-01,  6.62915e-01,
       -5.02792e-01,  1.98779e-01, -4.49525e-01,  5.84935e-01,
       -5.43423e-01, -2.43011e-01,  4.47865e-01, -9.40753e-01,
        1.05947e-01,  7.03076e-01,  1.50620e-01,  1.54158e-01,
       -2.29329e-01, -4.76156e-01, -8.02836e-02, -8.08539e-04,
       -1.04304e-01, -2.13944e-03, -1.87402e-01, -2.08901e-02,
       -1.12400e-01, -1.68778e-01, -2.73255e-01,  1.587

array([[0.49309866]])

In [31]:
embedding.get_embedding(0)

TypeError: get_embedding() takes 1 positional argument but 2 were given