# Init

In [1]:
%matplotlib inline

import csv
import itertools
import math
import matplotlib
import time
import logging
import sys
import os
import random
import warnings

import gensim
from gensim.models import KeyedVectors

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict

from sklearn import random_projection
from sklearn.manifold import TSNE
from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, spdiags
from scipy.io import loadmat, savemat
from scipy.spatial.distance import cosine
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize, MultiLabelBinarizer

In [2]:
from fastrp_exp import *

In [3]:
from sklearn import preprocessing

In [4]:
from scoring import scoring

In [5]:
# Silence library warnings (notably sklearn's) for the rest of the session.
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Replace the module attribute so calls made through ``warnings.warn`` are dropped.
warnings.warn = warn
# Also ignore RuntimeWarning via the regular warning-filter machinery.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [6]:
%load_ext autoreload
%autoreload 2

# Load Data

## Blogcatalog

In [21]:
# Read the BlogCatalog .mat file; its 'network' entry is used as the adjacency matrix.
blogcatalog = loadmat('example_graphs/blogcatalog.mat')
A = blogcatalog['network']
N = A.shape[0]  # size of the first dimension of A
A  # bare last expression so the notebook displays the matrix summary

<10312x10312 sparse matrix of type '<class 'numpy.float64'>'
	with 667966 stored elements in Compressed Sparse Column format>

In [22]:
# Build a NetworkX graph from the sparse adjacency matrix A.
# NOTE(review): from_scipy_sparse_matrix was removed in NetworkX 3.0 (replaced by
# from_scipy_sparse_array) — pin networkx<3 or migrate before re-running.
G = nx.from_scipy_sparse_matrix(A)

In [23]:
# Node labels of G converted to plain Python ints (used for the range check below).
int_nodes = [int(node) for node in G.nodes()]

In [24]:
# Sanity check: smallest node id, largest node id, and number of nodes.
min(int_nodes), max(int_nodes), len(int_nodes)

(0, 10311, 10312)

In [25]:
# Node and edge counts of the full graph.
G.number_of_nodes(), G.number_of_edges()

(10312, 333983)

# Sample a Subgraph

In [26]:
import networkx as nx
from scipy.sparse import find
from random import sample
from scipy.spatial.distance import pdist, squareform

In [27]:
# Views over the full graph, reused by the sampling cells below.
G_nodes = G.nodes()
G_edges = G.edges()

# Start the training graph from a minimum spanning tree of G.
tree = nx.minimum_spanning_tree(G)
tree_edges = tree.edges()

# Edge budget of the training graph: 70% of the edges of G, truncated to an int.
n_edges_70 = int(len(G_edges) * 0.7)

In [28]:
# Edges of G that are not already part of the spanning tree (candidates for sampling).
non_tree_edges = [edge for edge in G_edges if edge not in tree_edges]

In [29]:
# Top the spanning tree up to the 70% edge budget with randomly chosen non-tree edges.
# NOTE(review): no random seed is set in this notebook, so the split differs between runs.
sampled_edges = sample(non_tree_edges, n_edges_70 - len(tree.edges()))
# Mutates `tree` in place: re-running this cell without rebuilding `tree` adds more edges.
tree.add_edges_from(sampled_edges, weight=1.0)

In [30]:
# Persist the 70% training graph in the two formats read by the shell commands
# further down (.mat for RandNE/DeepWalk, .edgelist for LINE).
A_70 = nx.to_scipy_sparse_matrix(tree)
savemat('example_graphs/blogcatalog-70.mat', {'network': A_70})
# Pass the path rather than an open() handle: write_edgelist then opens and closes
# the file itself, so the data is flushed and no file descriptor is leaked.
nx.write_edgelist(tree.to_directed(), 'example_graphs/blogcatalog-70.edgelist', data=['weight'])

In [31]:
# Held-out positives: edges of the full graph G that are absent from the training graph.
train_edges = tree.edges()
edges_to_test = [edge for edge in G.edges() if edge not in train_edges]

# Materialise the node ids once. random.sample() needs a sequence: a set-like
# NodeView was copied into a tuple on every call by older Pythons and is rejected
# with TypeError on Python >= 3.11. Iteration order matches the view's order.
node_pool = list(G_nodes)

# Negatives: random node pairs that are not edges of G, one per held-out positive.
negatives = []
while len(negatives) < len(edges_to_test):
    rand_edge = sample(node_pool, 2)
    if rand_edge not in G_edges:
        negatives.append(rand_edge)

# Unconstrained random node pairs (may or may not be edges of G), same count.
rand_pairs = [sample(node_pool, 2) for _ in range(len(edges_to_test))]

# FastRP

In [45]:
import optuna

In [46]:
# Path prefix for result files.
# NOTE(review): not used by any active code visible in this notebook.
prefix = 'result/blog'

In [47]:
%%time

def objective(trial):
    """Optuna objective: score one FastRP configuration on held-out link prediction.

    Samples one extra projection weight (log-uniform in [0.001, 1]) and ``alpha``
    (uniform in [-0.4, 0.4]), embeds the training graph ``A_70`` with
    ``fastrp_wrapper`` and compares cosine distances of the held-out edges
    (``edges_to_test``) with those of the random pairs (``rand_pairs``).

    Parameters
    ----------
    trial
        Optuna trial object providing ``suggest_loguniform`` / ``suggest_uniform``.

    Returns
    -------
    float
        The negated fraction of positions ``i`` at which held-out edge ``i`` has a
        smaller cosine distance than random pair ``i``. It is negated because the
        study is created with Optuna's default direction (minimise).
    """
    order_range = 1
    # One tunable weight per extra order; the first weight is fixed to 1.0 below.
    weights = [trial.suggest_loguniform('weight' + str(order), 0.001, 1) for order in range(order_range)]
    alpha = trial.suggest_uniform('alpha', -0.4, 0.4)
    conf = {
        'projection_method': 'sparse',
        'input_matrix': 'adj',
        'weights': [1.0] + weights,
        'normalization': False,
        'dim': 512,
        'alpha': alpha,
        'C': 0.1
    }
    print(conf)

    U = fastrp_wrapper(A_70, conf)
    # Only the random-pair comparison feeds the returned score, so the unused
    # comparison against `negatives` is no longer computed on every trial.
    distances_random = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in rand_pairs])
    distances_pos = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in edges_to_test])

    scores_random = (distances_pos < distances_random).sum() / len(distances_random)
    return -scores_random

# create_study() is called without a direction, i.e. Optuna's default (minimise);
# objective() returns the negated score for that reason.
study = optuna.create_study()  # Create a new study.
study.optimize(objective, n_trials=100)  # Run up to 100 trials of the objective.

{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.021544963446030133], 'normalization': False, 'dim': 512, 'alpha': 0.09782771951750907, 'C': 0.1}


[I 2019-05-20 03:03:34,992] Finished trial#0 resulted in value: -0.9126403513149359. Current best value is -0.9126403513149359 with parameters: {'weight0': 0.021544963446030133, 'alpha': 0.09782771951750907}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.012815620029244293], 'normalization': False, 'dim': 512, 'alpha': 0.2842701365477386, 'C': 0.1}


[I 2019-05-20 03:03:45,359] Finished trial#1 resulted in value: -0.9208044313588503. Current best value is -0.9208044313588503 with parameters: {'weight0': 0.012815620029244293, 'alpha': 0.2842701365477386}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.49747464155037435], 'normalization': False, 'dim': 512, 'alpha': 0.05504659209821233, 'C': 0.1}


[I 2019-05-20 03:03:56,148] Finished trial#2 resulted in value: -0.8325066121063925. Current best value is -0.9208044313588503 with parameters: {'weight0': 0.012815620029244293, 'alpha': 0.2842701365477386}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.7204584510765775], 'normalization': False, 'dim': 512, 'alpha': 0.07373119151153212, 'C': 0.1}


[I 2019-05-20 03:04:06,725] Finished trial#3 resulted in value: -0.8289235989819851. Current best value is -0.9208044313588503 with parameters: {'weight0': 0.012815620029244293, 'alpha': 0.2842701365477386}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.013404661038749922], 'normalization': False, 'dim': 512, 'alpha': 0.22125141173225782, 'C': 0.1}


[I 2019-05-20 03:04:17,110] Finished trial#4 resulted in value: -0.921363341484106. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.002647265831483687], 'normalization': False, 'dim': 512, 'alpha': -0.2773632922171927, 'C': 0.1}


[I 2019-05-20 03:04:27,827] Finished trial#5 resulted in value: -0.8228753929836818. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.005885942347918707], 'normalization': False, 'dim': 512, 'alpha': -0.3673447874774704, 'C': 0.1}


[I 2019-05-20 03:04:38,930] Finished trial#6 resulted in value: -0.8079245471330905. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.035202187692936954], 'normalization': False, 'dim': 512, 'alpha': -0.20181752538419329, 'C': 0.1}


[I 2019-05-20 03:04:49,285] Finished trial#7 resulted in value: -0.87859673636409. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.004060589059327722], 'normalization': False, 'dim': 512, 'alpha': 0.1699390993233325, 'C': 0.1}


[I 2019-05-20 03:04:59,834] Finished trial#8 resulted in value: -0.8950746045211837. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.01956318433099455], 'normalization': False, 'dim': 512, 'alpha': 0.3414474074759337, 'C': 0.1}


[I 2019-05-20 03:05:10,285] Finished trial#9 resulted in value: -0.9203153849992515. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.11586617570941557], 'normalization': False, 'dim': 512, 'alpha': -0.08365715486916458, 'C': 0.1}


[I 2019-05-20 03:05:20,832] Finished trial#10 resulted in value: -0.8635261240580867. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.0010617335057234677], 'normalization': False, 'dim': 512, 'alpha': -0.11181160808842949, 'C': 0.1}


[I 2019-05-20 03:05:31,349] Finished trial#11 resulted in value: -0.8032935775238286. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.07682484638330303], 'normalization': False, 'dim': 512, 'alpha': 0.26765047203957626, 'C': 0.1}


[I 2019-05-20 03:05:42,153] Finished trial#12 resulted in value: -0.869694096511802. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.23864664893770132], 'normalization': False, 'dim': 512, 'alpha': 0.3815373292452864, 'C': 0.1}


[I 2019-05-20 03:05:52,897] Finished trial#13 resulted in value: -0.8191326912520585. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.0016503299827000434], 'normalization': False, 'dim': 512, 'alpha': 0.21895043795308688, 'C': 0.1}


[I 2019-05-20 03:06:03,426] Finished trial#14 resulted in value: -0.8525275712360896. Current best value is -0.921363341484106 with parameters: {'weight0': 0.013404661038749922, 'alpha': 0.22125141173225782}.


{'projection_method': 'sparse', 'input_matrix': 'adj', 'weights': [1.0, 0.008390857817801337], 'normalization': False, 'dim': 512, 'alpha': -0.053801037170695454, 'C': 0.1}


KeyboardInterrupt: 

In [74]:
study.best_trial

FrozenTrial(number=65, state=<TrialState.COMPLETE: 1>, value=-0.9284806340660984, datetime_start=datetime.datetime(2019, 5, 19, 22, 44, 33, 349661), datetime_complete=datetime.datetime(2019, 5, 19, 22, 44, 37, 220980), params={'weight0': 20.485726627014053, 'alpha': 0.3978105115791798}, user_attrs={}, system_attrs={'_number': 65}, intermediate_values={}, params_in_internal_repr={'weight0': 20.485726627014053, 'alpha': 0.3978105115791798}, trial_id=65)

In [48]:
# Fixed FastRP configuration for the final evaluation run; weight and alpha are
# rounded from the parameters logged for trial#1 of the search above.
conf = dict(
    projection_method='sparse',
    input_matrix='adj',
    weights=[1.0, 0.0128],
    normalization=False,
    dim=512,
    alpha=0.28,
    C=0.1,
)

In [49]:
# Embed the 70% training graph with the fixed configuration.
U = fastrp_wrapper(A_70, conf)

# Cosine distance between the endpoint embeddings of every candidate pair.
distances_negative = np.array([cosine(U[src], U[dst]) for src, dst in negatives])
distances_random = np.array([cosine(U[src], U[dst]) for src, dst in rand_pairs])
distances_pos = np.array([cosine(U[src], U[dst]) for src, dst in edges_to_test])

# Fraction of positions where the held-out edge is closer than the paired
# non-edge / random pair.
scores_negative = np.mean(distances_pos < distances_negative)
scores_random = np.mean(distances_pos < distances_random)
print(scores_random)

0.9208643145865563


# RandNE

Run RandNE:

In [None]:
# Shell command (run in a terminal, not in the Python kernel).
# Runs randne.py on the 70% training graph (blogcatalog-70.mat) and writes the
# embedding file that the following cell loads.
# NOTE(review): the meaning of -q, -d and --weights is defined by randne.py — confirm there.
python3 /home/hcchen/RandNE-Python/src/randne.py \
--input /home/hcchen/fast-random-projection/example_graphs/blogcatalog-70.mat \
--output /home/hcchen/fast-random-projection/result/blogcatalog-70-randne-emb.mat -q 2 -d 512 --weights 1 0.01

In [41]:
# Load the RandNE embedding ('emb' entry of the .mat file). The path is relative to
# the repository root, like the other data paths in this notebook, instead of a
# machine-specific absolute /home/... path.
U = loadmat('result/blogcatalog-70-randne-emb.mat')['emb']

# Cosine distance between the endpoint embeddings of every candidate pair.
distances_negative = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in negatives])
distances_random = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in rand_pairs])
distances_pos = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in edges_to_test])

# Fraction of positions where the held-out edge is closer than the paired
# non-edge / random pair.
scores_negative = (distances_pos < distances_negative).sum() / len(distances_negative)
scores_random = (distances_pos < distances_random).sum() / len(distances_random)
print(scores_random)

0.9124507210938669


# LINE

Compile the code:

In [None]:
# Shell commands (run in the LINE source directory, not in the Python kernel).
# Build the four LINE tools (line, reconstruct, normalize, concatenate) with the
# same compiler flags; only `line` is additionally linked with -lgsl/-lgslcblas.
g++ -I /home/hcchen/gsl/include -L /home/hcchen/gsl/lib -lm -pthread -Ofast -march=native -Wall -funroll-loops -ffast-math -Wno-unused-result line.cpp -o line -lgsl -lm -lgslcblas
g++ -I /home/hcchen/gsl/include -L /home/hcchen/gsl/lib -lm -pthread -Ofast -march=native -Wall -funroll-loops -ffast-math -Wno-unused-result reconstruct.cpp -o reconstruct
g++ -I /home/hcchen/gsl/include -L /home/hcchen/gsl/lib -lm -pthread -Ofast -march=native -Wall -funroll-loops -ffast-math -Wno-unused-result normalize.cpp -o normalize
g++ -I /home/hcchen/gsl/include -L /home/hcchen/gsl/lib -lm -pthread -Ofast -march=native -Wall -funroll-loops -ffast-math -Wno-unused-result concatenate.cpp -o concatenate

Before running LINE, add the GSL library directory to `LD_LIBRARY_PATH` in the shell:

In [None]:
# Append the GSL lib directory to the dynamic-loader search path for this shell session.
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/home/hcchen/gsl/lib

Run LINE:

In [None]:
# Shell command (not Python): trains LINE in the background (nohup ... &) on the
# 70% training edge list and writes a text-format (-binary 0) embedding file that
# the link-prediction cell below loads.
# NOTE(review): the remaining flag semantics are defined by the LINE binary — confirm there.
nohup ./line -train /home/hcchen/fast-random-projection/example_graphs/blogcatalog-70.edgelist \
-output /home/hcchen/fast-random-projection/result/blogcatalog-70.line.emb \
-binary 0 -size 200 -order 2 -negative 5 -samples 1000 -rho 0.025 -threads 40 &

Link prediction:

In [53]:
line_scores_negative, line_scores_random = [], []
# Load the LINE embedding (word2vec text format). The path is relative to the
# repository root, like the other data paths in this notebook, instead of a
# machine-specific absolute /home/... path.
model = KeyedVectors.load_word2vec_format('result/blogcatalog-70.line.emb')
# Stack the vectors in node-id order so that row i of U is the embedding of node i.
# NOTE(review): `.vocab` was removed in gensim 4 (use `key_to_index`) — pin gensim<4 or migrate.
U = np.asarray([model[str(node)] for node in range(len(model.vocab))])

# Cosine distance between the endpoint embeddings of every candidate pair.
distances_negative = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in negatives])
distances_random = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in rand_pairs])
distances_pos = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in edges_to_test])

# Fraction of positions where the held-out edge is closer than the paired
# non-edge / random pair.
scores_negative = (distances_pos < distances_negative).sum() / len(distances_negative)
scores_random = (distances_pos < distances_random).sum() / len(distances_random)
print(scores_random)

0.465252757123609


# DeepWalk

Run the following command in shell:

In [None]:
# Shell command (not Python): runs DeepWalk in the background (nohup ... &), timed
# with `time`, on the 70% training graph in .mat format; the output file is the
# embedding loaded by the link-prediction cell below.
nohup time deepwalk --format mat --input example_graphs/blogcatalog-70.mat \
--max-memory-data-size 0 --number-walks 80 --representation-size 128 --walk-length 40 --window-size 10 \
--workers 40 --output result/blogcatalog-70.deepwalk.emb &

Link prediction:

In [51]:
# Load the DeepWalk embedding (word2vec text format) from the same relative path the
# deepwalk command writes to, instead of a machine-specific absolute /home/... path.
model = KeyedVectors.load_word2vec_format('result/blogcatalog-70.deepwalk.emb')
# Stack the vectors in node-id order so that row i of U is the embedding of node i.
# NOTE(review): `.vocab` was removed in gensim 4 (use `key_to_index`) — pin gensim<4 or migrate.
U = np.asarray([model[str(node)] for node in range(len(model.vocab))])

# Cosine distance between the endpoint embeddings of every candidate pair.
distances_negative = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in negatives])
distances_random = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in rand_pairs])
distances_pos = np.array([cosine(U[edge[0]], U[edge[1]]) for edge in edges_to_test])

# Fraction of positions where the held-out edge is closer than the paired
# non-edge / random pair.
scores_negative = (distances_pos < distances_negative).sum() / len(distances_negative)
scores_random = (distances_pos < distances_random).sum() / len(distances_random)
print(scores_random)

0.7481311442686761
