# Init

In [3]:
%matplotlib inline

import csv
import datetime
import itertools
import json
import math
import matplotlib
import time
import logging
import sys
import sqlite3
import os
import random
import warnings

import gensim

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict

from sklearn.manifold import TSNE
from scipy.sparse import coo_matrix, csr_matrix
# csc_spdiags
from scipy.io import loadmat, savemat
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize, MultiLabelBinarizer

In [4]:
# to ignore sklearn warning
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
warnings.filterwarnings("ignore", category=RuntimeWarning) 

In [5]:
import scipy.sparse

In [6]:
from scoring import main

In [7]:
from sklearn import random_projection

In [8]:
%load_ext autoreload
%autoreload 2

# Load Data

## Blogcatalog

In [9]:
blogcatalog = loadmat('blogcatalog/blogcatalog.mat')

In [10]:
blog_labels = blogcatalog['group']

In [11]:
blog_A = blogcatalog['network']

In [12]:
blog_A

<10312x10312 sparse matrix of type '<class 'numpy.float64'>'
	with 667966 stored elements in Compressed Sparse Column format>

In [13]:
N = blog_A.shape[0]

Obtain its transition matrix (note that the input matrix is symmetric):

In [16]:
normalizer = scipy.sparse.spdiags(np.squeeze(1.0 / scipy.sparse.csc_matrix.sum(blog_A, axis=1) ), 0, N, N)

In [17]:
blog_trans = normalizer @ blog_A

# Utility Functions

Write the graph to a file as adjacency list:

In [18]:
def sparse2graph(x):
    G = defaultdict(lambda: set())
    cx = x.tocoo()
    for i, j, v in zip(cx.row, cx.col, cx.data):
        G[i].add(j)
    
    lines = []
    for k, v in G.items():
        s = str(k) + ' ' + ' '.join(map(str, v))
        lines.append(s)
    return lines

In [19]:
G = nx.parse_adjlist(sparse2graph(blog_A))

# Random Projection

Iterative computing:

In [20]:
# projection method: choose from Gaussian and Sparse
# input matrix: choose from adjacency and transition matrix
def randne_projection(A, q=3, dim=128, projection_method='gaussian', input_matrix='adj'):
    assert input_matrix == 'adj' or input_matrix == 'trans'
    assert projection_method == 'gaussian' or projection_method == 'sparse'
    
    if input_matrix == 'adj':
        M = A
    else:
        N = A.shape[0]
        normalizer = scipy.sparse.spdiags(np.squeeze(1.0 / scipy.sparse.csc_matrix.sum(A, axis=1) ), 0, N, N)
        M = normalizer @ A
    # Gaussian projection matrix
    if projection_method == 'gaussian':
        transformer = random_projection.GaussianRandomProjection(n_components=dim, random_state=42)
    # Sparse projection matrix
    else:
        transformer = random_projection.SparseRandomProjection(n_components=dim, random_state=42)
    # Random projection for A
    cur_U = transformer.fit_transform(M)
    U_list = [cur_U]
    
    for i in range(2, q + 1):
        cur_U = M @ cur_U
        U_list.append(cur_U)
    return U_list

Brute force computing, designed for the clipped transition matrix (introduced below):

**Shall I use different projection matrices???**

In [21]:
def get_clipped_matrices(A, q=3, beta=None):
    M_list = []
    N = A.shape[0]
    normalizer = scipy.sparse.spdiags(np.squeeze(1.0 / scipy.sparse.csc_matrix.sum(A, axis=1) ), 0, N, N)
    M = normalizer @ A
    if beta is None:
        beta = 1.0 / N

    for i in range(1, q + 1):
        print ('Current order: ', i)
        _mat = get_grarep_clipped_matrix(trans_pow / trans_pow.sum(axis=0), beta)
        M_list.append(_mat)
        # is this going to cause loss of accuracy?
        trans_pow = trans @ trans_pow
    
    return M_list

# takes in pre-computed clipped transition matrices
def randne_projection_bruteforce(M_list, d=128):
    U_list = []
    
    # Gaussian projection matrix
    if projection_method == 'gaussian':
        transformer = random_projection.GaussianRandomProjection(n_components=dim, random_state=42)
    # Sparse projection matrix
    else:
        transformer = random_projection.SparseRandomProjection(n_components=dim, random_state=42)

    U_list = []
    
    for M in M_list:
        cur_U = transformer.fit_transform(M)
        U_list.append(cur_U)
    return U_list

# Merge Embeddings

Merge embeddings (projections) from different powers of $A$ together:

In [22]:
# When weights is None, concatenate instead of linearly combines the embeddings from different powers of A
def randne_merge(U_list, weights, normalization=False):
    # print (type(U_list[0]))
    dense_U_list = [_U.todense() for _U in U_list] \
        if type(U_list[0]) == csc_matrix or type(U_list[0]) == csr_matrix else U_list
    # print (type(dense_U_list[0]))
    _U_list = [normalize(_U, norm='l2', axis=1) for _U in dense_U_list] if normalization else dense_U_list

    if weights is None:
        # print (type(_U_list[0]))
        return np.concatenate(_U_list, axis=1)
    U = np.zeros_like(_U_list[0])
    for cur_U, weight in zip(_U_list, weights):
        U += cur_U * weight
    return U

# All Variants to be Considered

Here lists all variants that we consider:

1. Different projection matrices: Gaussian and Signed Binary (the sparse one)
2. Different input graph matrices: adjacency matrix and transition matrix and shifted transition matrix (see the GraRep paper -- may also need to tune $\beta$). Also, the shifted transition matrix one requires exact computation of $A^k$.
4. Different weight combinations of different powers: use the recommended weight combination in the paper (or do a grid search here?), and also only $A$, only $A^2$, only $A^3$ and so on. Also use one averaged over all powers of $A$, so that it is comparable to DeepWalk. Or alternatively, just take the concatenation of different embedding vectors.
4. Normalization: do we (L2) normalize the embeddings computed from each power of $A$ or not?

Here are two sample configurations:

In [212]:
conf1 = {
    'projection_method': 'sparse',
    'input_matrix': 'trans',
    'weights': [1.0, 1e2, 1e3, 0, 0, 0],
    'normalization': True,
    'd': 128,
}

conf2 = {
    'projection_method': 'gaussian',
    'input_matrix': 'adj',
    'weights': None,
    'normalization': False,
    'd': 256,
}

Then we generate all possible configurations. First, generate different weights:

In [279]:
max_order = 6
# should it be a mapping between name and configuration?
# using only A^k
weights = [[1.0 if order == chosen_order else 0 for order in range(max_order)] for chosen_order in range(max_order)]
# recommended setting for trans matrix
weights += [ [1.0, 1e2, 1e3, 0, 0, 0] ]
# recommended setting for adj matrix
weights += [ [0.1, 0.001, 0, 0, 0, 0] ]
# DeepWalk style weights
weights += [ [1.0 for _ in range(order)] + [0 for _ in range(max_order - order)] for order in range(2, max_order + 1)]
# no weights, use concatenation instead
weights += [None]
# TODO: grid search on weights?
weights

[[1.0, 0, 0, 0, 0, 0],
 [0, 1.0, 0, 0, 0, 0],
 [0, 0, 1.0, 0, 0, 0],
 [0, 0, 0, 1.0, 0, 0],
 [0, 0, 0, 0, 1.0, 0],
 [0, 0, 0, 0, 0, 1.0],
 [1.0, 100.0, 1000.0, 0, 0, 0],
 [0.1, 0.001, 0, 0, 0, 0],
 [1.0, 1.0, 0, 0, 0, 0],
 [1.0, 1.0, 1.0, 0, 0, 0],
 [1.0, 1.0, 1.0, 1.0, 0, 0],
 [1.0, 1.0, 1.0, 1.0, 1.0, 0],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
 None]

In [280]:
keys = ['projection_method', 'input_matrix', 'weights', 'normalization', 'dim']

all_dims = [128, 256, 512]

all_conf_raw = itertools.product(
    # projection method
    ['gaussian', 'sparse'],
    # input matrix
    ['adj', 'trans'],
    weights,
    # normalization
    [True, False],
    # dimensionality
    all_dims
)
all_conf = [{key: val for key, val in zip(keys, conf)} for conf in all_conf_raw]
len(all_conf)

336

In [281]:
all_conf[124]

{'projection_method': 'gaussian',
 'input_matrix': 'trans',
 'weights': [1.0, 100.0, 1000.0, 0, 0, 0],
 'normalization': False,
 'dim': 256}

Finally, a wrapper function that takes in a configuration and return the learned embeddings:

In [386]:
# A is the adjacency matrix
def randne_wrapper(A, conf, computed_U_list):
    # avoid redundant computation, but may increase memory usage
    conf_fingerprint = get_projection_fingerprint(conf)
    if conf_fingerprint in computed_U_list:
        U_list = computed_U_list[conf_fingerprint]
        # print ('Fingerprint found for %s, skip computing the projected matrices.' % conf_fingerprint)
    else:
        # projection method: choose from Gaussian and Sparse
        # input matrix: choose from adjacency and transition matrix
        U_list = randne_projection(A,
                                   q=max_order,
                                   # q=len(conf['weights']),
                                   dim=conf['dim'],
                                   projection_method=conf['projection_method'],
                                   input_matrix=conf['input_matrix']
        )
        computed_U_list[conf_fingerprint] = U_list

    U = randne_merge(U_list, conf['weights'], conf['normalization'])
    return U

**Reduce Redundant Computation**

Create fingerprints for the projection part:

In [283]:
def get_projection_fingerprint(conf):
    return 'dim=' + str(conf['dim']) + ',projection_method=' + conf['projection_method'] \
        + ',input_matrix=' + conf['input_matrix']

In [284]:
get_projection_fingerprint(all_conf[133])

'dim=256,projection_method=gaussian,input_matrix=trans'

In [285]:
all_conf[133]

{'projection_method': 'gaussian',
 'input_matrix': 'trans',
 'weights': [1.0, 1.0, 0, 0, 0, 0],
 'normalization': True,
 'dim': 256}

**GraRep Style Input Matrix**

We defines a utility function to construct a GraRep style input matrix (see the GraRep paper). In other words, for each element $x$ in the $k$-step transition matrix, we set:
$x^\prime = \max(\log x - \log \beta, 0)$

Note: it is recommended that $\beta$ should be set to $1 / N$.

In [286]:
# takes in a numpy.matrix and return a CSC sparse matrix
def get_grarep_clipped_matrix(A, beta):
    tmp = np.log(A) - np.log(beta)
    tmp[np.isneginf(tmp)] = 0
    return csc_matrix(np.clip(tmp, 0, None) )

**Important**: this does not fit into the current RandNE framework, because we have to compute the GraRep clipped input matrix using brute force. Here we first compute the clipped matrices:

In [288]:
all_grarep_conf_raw = itertools.product(
    # projection method
    ['gaussian', 'sparse'],
    # input matrix, just a placeholder
    ['grarep_trans'],
    weights,
    # normalization
    [True, False],
    # dimensionality
    all_dims
)
all_grarep_conf = [{key: val for key, val in zip(keys, conf)} for conf in all_grarep_conf_raw]
len(all_grarep_conf)

168

In [289]:
all_grarep_conf

[{'projection_method': 'gaussian',
  'input_matrix': 'grarep_trans',
  'weights': [1.0, 0, 0, 0, 0, 0],
  'normalization': True,
  'dim': 128},
 {'projection_method': 'gaussian',
  'input_matrix': 'grarep_trans',
  'weights': [1.0, 0, 0, 0, 0, 0],
  'normalization': True,
  'dim': 256},
 {'projection_method': 'gaussian',
  'input_matrix': 'grarep_trans',
  'weights': [1.0, 0, 0, 0, 0, 0],
  'normalization': True,
  'dim': 512},
 {'projection_method': 'gaussian',
  'input_matrix': 'grarep_trans',
  'weights': [1.0, 0, 0, 0, 0, 0],
  'normalization': False,
  'dim': 128},
 {'projection_method': 'gaussian',
  'input_matrix': 'grarep_trans',
  'weights': [1.0, 0, 0, 0, 0, 0],
  'normalization': False,
  'dim': 256},
 {'projection_method': 'gaussian',
  'input_matrix': 'grarep_trans',
  'weights': [1.0, 0, 0, 0, 0, 0],
  'normalization': False,
  'dim': 512},
 {'projection_method': 'gaussian',
  'input_matrix': 'grarep_trans',
  'weights': [0, 1.0, 0, 0, 0, 0],
  'normalization': True,
  'd

In [None]:
M_list = get_clipped_matrices(blog_A, q=max_order, beta=None)

for dim in all_dims:
    for 
    U_list = randne_projection_bruteforce(M_list, d=dim)
    U_list = randne_projection(A,
                               q=len(conf['weights']),
                               dim=conf['dim'],
                               projection_method=conf['projection_method'],
                               input_matrix=conf['input_matrix']
    )
    U = randne_merge(U_list, conf['weights'], conf['normalization'])

# Grid Search on all Hyperparameters

Define the embedding file's name based on the hyperparameters:

In [383]:
def get_emb_filename(prefix, conf):
    return prefix + '-dim=' + str(conf['dim']) + ',projection_method=' + conf['projection_method'] \
        + ',input_matrix=' + conf['input_matrix'] + ',normalization=' + str(conf['normalization']) \
        + ',weights=' + (','.join(map(str, conf['weights'])) if conf['weights'] is not None else 'None') + '.mat'

In [345]:
get_emb_filename('data/blog', all_conf[1])

'data/blog-dim=256,projection_method=gaussian,input_matrix=adj,normalization=True,weights=1.0,0,0,0,0,0.mat'

We start with non-clipped input matrices:

In [438]:
# 246, 330
INF = 1000

df = pd.DataFrame()

prefix = 'data/blog'

computed_U_list = {}
# for conf in [all_conf[21]]:
for conf in all_conf[246:]:
    # print (conf)
    emb_filename = get_emb_filename(prefix, conf)
    print (emb_filename)
    # first check if this file already exists
    path = Path(emb_filename)
    if not path.is_file():
        U = randne_wrapper(blog_A, conf, computed_U_list)
        savemat(emb_filename, {'emb': U})
    else:
        print ('File %s already exists, skipped.' % emb_filename)
    f1_scores = main(
        ["--emb", emb_filename,
          "--network","blogcatalog/blogcatalog.mat",
          "--num-shuffles", "5"],
        )
    # see https://stackoverflow.com/questions/38987/how-to-merge-two-dictionaries-in-a-single-expression
    df = df.append(pd.Series({**conf, **f1_scores}), ignore_index=True)
    df.to_csv('data/blog-scores.txt', sep='\t', index=False, header=True)

df

data/blog-dim=128,projection_method=sparse,input_matrix=adj,normalization=True,weights=None.mat
File data/blog-dim=128,projection_method=sparse,input_matrix=adj,normalization=True,weights=None.mat already exists, skipped.
data/blog-dim=256,projection_method=sparse,input_matrix=adj,normalization=True,weights=None.mat
File data/blog-dim=256,projection_method=sparse,input_matrix=adj,normalization=True,weights=None.mat already exists, skipped.
data/blog-dim=512,projection_method=sparse,input_matrix=adj,normalization=True,weights=None.mat
File data/blog-dim=512,projection_method=sparse,input_matrix=adj,normalization=True,weights=None.mat already exists, skipped.
data/blog-dim=128,projection_method=sparse,input_matrix=adj,normalization=False,weights=None.mat
File data/blog-dim=128,projection_method=sparse,input_matrix=adj,normalization=False,weights=None.mat already exists, skipped.
data/blog-dim=256,projection_method=sparse,input_matrix=adj,normalization=False,weights=None.mat
File data/blo

data/blog-dim=256,projection_method=sparse,input_matrix=trans,normalization=False,weights=0,0,0,0,1.0,0.mat
File data/blog-dim=256,projection_method=sparse,input_matrix=trans,normalization=False,weights=0,0,0,0,1.0,0.mat already exists, skipped.
data/blog-dim=512,projection_method=sparse,input_matrix=trans,normalization=False,weights=0,0,0,0,1.0,0.mat
File data/blog-dim=512,projection_method=sparse,input_matrix=trans,normalization=False,weights=0,0,0,0,1.0,0.mat already exists, skipped.
data/blog-dim=128,projection_method=sparse,input_matrix=trans,normalization=True,weights=0,0,0,0,0,1.0.mat
File data/blog-dim=128,projection_method=sparse,input_matrix=trans,normalization=True,weights=0,0,0,0,0,1.0.mat already exists, skipped.
data/blog-dim=256,projection_method=sparse,input_matrix=trans,normalization=True,weights=0,0,0,0,0,1.0.mat
File data/blog-dim=256,projection_method=sparse,input_matrix=trans,normalization=True,weights=0,0,0,0,0,1.0.mat already exists, skipped.
data/blog-dim=512,pr

data/blog-dim=256,projection_method=sparse,input_matrix=trans,normalization=True,weights=1.0,1.0,1.0,1.0,0,0.mat
File data/blog-dim=256,projection_method=sparse,input_matrix=trans,normalization=True,weights=1.0,1.0,1.0,1.0,0,0.mat already exists, skipped.
data/blog-dim=512,projection_method=sparse,input_matrix=trans,normalization=True,weights=1.0,1.0,1.0,1.0,0,0.mat
File data/blog-dim=512,projection_method=sparse,input_matrix=trans,normalization=True,weights=1.0,1.0,1.0,1.0,0,0.mat already exists, skipped.
data/blog-dim=128,projection_method=sparse,input_matrix=trans,normalization=False,weights=1.0,1.0,1.0,1.0,0,0.mat
File data/blog-dim=128,projection_method=sparse,input_matrix=trans,normalization=False,weights=1.0,1.0,1.0,1.0,0,0.mat already exists, skipped.
data/blog-dim=256,projection_method=sparse,input_matrix=trans,normalization=False,weights=1.0,1.0,1.0,1.0,0,0.mat
File data/blog-dim=256,projection_method=sparse,input_matrix=trans,normalization=False,weights=1.0,1.0,1.0,1.0,0,0.m

Unnamed: 0,dim,input_matrix,macro,micro,normalization,projection_method,weights
0,128.0,adj,0.085381,0.241582,1.0,sparse,
1,256.0,adj,0.089190,0.254667,1.0,sparse,
2,512.0,adj,0.092246,0.263288,1.0,sparse,
3,128.0,adj,0.067133,0.193898,0.0,sparse,
4,256.0,adj,0.060223,0.191359,0.0,sparse,
5,512.0,adj,0.069246,0.198221,0.0,sparse,
6,128.0,trans,0.072988,0.223236,1.0,sparse,"[1.0, 0, 0, 0, 0, 0]"
7,256.0,trans,0.079541,0.239087,1.0,sparse,"[1.0, 0, 0, 0, 0, 0]"
8,512.0,trans,0.086045,0.257466,1.0,sparse,"[1.0, 0, 0, 0, 0, 0]"
9,128.0,trans,0.042442,0.183766,0.0,sparse,"[1.0, 0, 0, 0, 0, 0]"


# Result Analysis

First we merge the CSV files together:

In [None]:
# Run this in Shell
cp blog-scores.0.txt blog-scores.txt \
&& (tail --lines=+2 blog-scores.1.txt) >> blog-scores.txt \
&& (tail --lines=+2 blog-scores.2.txt) >> blog-scores.txt
# Somehow there is one line missing...

Then load as a pandas dataframe:

In [451]:
df = pd.read_csv('data/blog-scores.txt', sep='\t')

In [452]:
df = df[['input_matrix', 'projection_method', 'dim', \
                                 'normalization', 'weights', 'macro', 'micro']]
df = df.sort_values(by=['macro'], ascending=False)
df.head(n=10)

Unnamed: 0,input_matrix,projection_method,dim,normalization,weights,macro,micro
124,trans,gaussian,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.205684,0.339802
292,trans,sparse,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.203735,0.337155
291,trans,sparse,256.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.194552,0.322727
123,trans,gaussian,256.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.193426,0.321471
121,trans,gaussian,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.191718,0.300013
289,trans,sparse,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.189793,0.300888
290,trans,sparse,128.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.18139,0.306407
122,trans,gaussian,128.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.176912,0.302676
325,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.176424,0.34551
157,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.171909,0.343444


### Dimensionality

In [463]:
df[(df.input_matrix == 'trans') & (df.projection_method == 'gaussian') & (df.macro > 0.10)] \
    .sort_values(by=['normalization', 'weights', 'dim', 'macro'],
                 ascending=[True, False, True, False]).head(n=18)

Unnamed: 0,input_matrix,projection_method,dim,normalization,weights,macro,micro
122,trans,gaussian,128.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.176912,0.302676
123,trans,gaussian,256.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.193426,0.321471
124,trans,gaussian,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.205684,0.339802
119,trans,gaussian,128.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.14331,0.2363
120,trans,gaussian,256.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.162454,0.259547
121,trans,gaussian,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.191718,0.300013
155,trans,gaussian,128.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.143781,0.303168
156,trans,gaussian,256.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.157987,0.325801
157,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.171909,0.343444
149,trans,gaussian,128.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 0]",0.139127,0.301971


In [464]:
df[(df.input_matrix == 'trans') & (df.projection_method == 'sparse') & (df.macro > 0.10)] \
    .sort_values(by=['normalization', 'weights', 'dim', 'macro'],
                 ascending=[True, False, True, False]).head(n=18)

Unnamed: 0,input_matrix,projection_method,dim,normalization,weights,macro,micro
290,trans,sparse,128.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.18139,0.306407
291,trans,sparse,256.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.194552,0.322727
292,trans,sparse,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.203735,0.337155
287,trans,sparse,128.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.144694,0.241106
288,trans,sparse,256.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.163789,0.261305
289,trans,sparse,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.189793,0.300888
323,trans,sparse,128.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.145962,0.305806
324,trans,sparse,256.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.167162,0.330787
325,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.176424,0.34551
317,trans,sparse,128.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 0]",0.146562,0.306762


### Transition vs Adjacency Matrix

In [465]:
df[(df.input_matrix == 'trans')].head(n=8)

Unnamed: 0,input_matrix,projection_method,dim,normalization,weights,macro,micro
124,trans,gaussian,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.205684,0.339802
292,trans,sparse,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.203735,0.337155
291,trans,sparse,256.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.194552,0.322727
123,trans,gaussian,256.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.193426,0.321471
121,trans,gaussian,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.191718,0.300013
289,trans,sparse,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.189793,0.300888
290,trans,sparse,128.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.18139,0.306407
122,trans,gaussian,128.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.176912,0.302676


In [466]:
df[(df.input_matrix == 'adj')].head(n=8)

Unnamed: 0,input_matrix,projection_method,dim,normalization,weights,macro,micro
205,adj,sparse,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.158495,0.272308
38,adj,gaussian,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.154588,0.270395
37,adj,gaussian,256.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.147544,0.254353
204,adj,sparse,256.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.146408,0.256105
17,adj,gaussian,512.0,0.0,"[0, 0, 1.0, 0, 0, 0]",0.145886,0.264519
184,adj,sparse,512.0,0.0,"[0, 0, 1.0, 0, 0, 0]",0.145391,0.26752
59,adj,gaussian,512.0,0.0,"[1.0, 1.0, 1.0, 0, 0, 0]",0.145242,0.268172
226,adj,sparse,512.0,0.0,"[1.0, 1.0, 1.0, 0, 0, 0]",0.145067,0.264118


### Gaussian vs Sparse Random Projection

In terms of performance:

In [468]:
df[(df.projection_method == 'gaussian')].head(n=8)

Unnamed: 0,input_matrix,projection_method,dim,normalization,weights,macro,micro
124,trans,gaussian,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.205684,0.339802
123,trans,gaussian,256.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.193426,0.321471
121,trans,gaussian,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.191718,0.300013
122,trans,gaussian,128.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.176912,0.302676
157,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.171909,0.343444
151,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 0]",0.169497,0.343138
139,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 0, 0, 0]",0.165932,0.339843
145,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 0, 0]",0.164725,0.339905


In [469]:
df[(df.projection_method == 'sparse')].head(n=8)

Unnamed: 0,input_matrix,projection_method,dim,normalization,weights,macro,micro
292,trans,sparse,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.203735,0.337155
291,trans,sparse,256.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.194552,0.322727
289,trans,sparse,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.189793,0.300888
290,trans,sparse,128.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.18139,0.306407
325,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.176424,0.34551
319,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 0]",0.167199,0.342235
324,trans,sparse,256.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.167162,0.330787
313,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 0, 0]",0.167125,0.339616


In terms of runtime:

In [476]:
all_conf[124+1]

{'projection_method': 'gaussian',
 'input_matrix': 'trans',
 'weights': [1.0, 100.0, 1000.0, 0, 0, 0],
 'normalization': False,
 'dim': 512}

In [482]:
%time U = randne_wrapper(blog_A, all_conf[124+1], {})

CPU times: user 1.66 s, sys: 189 ms, total: 1.85 s
Wall time: 1.86 s


In [473]:
all_conf[292+1]

{'projection_method': 'sparse',
 'input_matrix': 'trans',
 'weights': [1.0, 100.0, 1000.0, 0, 0, 0],
 'normalization': False,
 'dim': 512}

In [483]:
%time U = randne_wrapper(blog_A, all_conf[292+1], {})

CPU times: user 7.33 s, sys: 9.56 s, total: 16.9 s
Wall time: 4.88 s


Why is the sparse matrix even slower? The claim in https://web.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf is that it should be $\sqrt{d}$ times faster.

### Normalization

In [486]:
df[(df.normalization == False)].head(n=8)

Unnamed: 0,input_matrix,projection_method,dim,normalization,weights,macro,micro
124,trans,gaussian,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.205684,0.339802
292,trans,sparse,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.203735,0.337155
291,trans,sparse,256.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.194552,0.322727
123,trans,gaussian,256.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.193426,0.321471
290,trans,sparse,128.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.18139,0.306407
122,trans,gaussian,128.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.176912,0.302676
17,adj,gaussian,512.0,0.0,"[0, 0, 1.0, 0, 0, 0]",0.145886,0.264519
184,adj,sparse,512.0,0.0,"[0, 0, 1.0, 0, 0, 0]",0.145391,0.26752


In [485]:
df[(df.normalization == True)].head(n=8)

Unnamed: 0,input_matrix,projection_method,dim,normalization,weights,macro,micro
121,trans,gaussian,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.191718,0.300013
289,trans,sparse,512.0,1.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.189793,0.300888
325,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.176424,0.34551
157,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.171909,0.343444
151,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 0]",0.169497,0.343138
319,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 0]",0.167199,0.342235
324,trans,sparse,256.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.167162,0.330787
313,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 0, 0]",0.167125,0.339616


### Weights

In [488]:
df.sort_values('macro',ascending = False).groupby('weights').head(2)

Unnamed: 0,input_matrix,projection_method,dim,normalization,weights,macro,micro
124,trans,gaussian,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.205684,0.339802
292,trans,sparse,512.0,0.0,"[1.0, 100.0, 1000.0, 0, 0, 0]",0.203735,0.337155
325,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.176424,0.34551
157,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0]",0.171909,0.343444
151,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 0]",0.169497,0.343138
319,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 1.0, 0]",0.167199,0.342235
313,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 0, 0]",0.167125,0.339616
139,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 0, 0, 0]",0.165932,0.339843
145,trans,gaussian,512.0,1.0,"[1.0, 1.0, 1.0, 1.0, 0, 0]",0.164725,0.339905
307,trans,sparse,512.0,1.0,"[1.0, 1.0, 1.0, 0, 0, 0]",0.159598,0.333073


# Node Classification on Blogcatalog

## RandNE

Learn embeddings with the recommended configuration (trans matrix + recommended weights):

In [19]:
%%time
weights = list(map(lambda x: 0.01 * x, [1e2, 1e4, 1e5]))
U_list_fast = randne_projection(blog_trans, q=3, dim=128)
U_fast = randne_merge(U_list_fast, weights)

CPU times: user 238 ms, sys: 52.2 ms, total: 290 ms
Wall time: 290 ms


In [20]:
U_fast

array([[-0.48844904,  0.20275235, -0.4192839 , ..., -1.11367325,
        -4.4354135 , -1.8893595 ],
       [-1.52668501,  3.5871983 ,  0.85047499, ..., -1.22756421,
        -4.41966455, -1.55155192],
       [-0.44574345,  0.4061968 , -0.65713982, ..., -0.58740916,
        -4.95153347, -1.93410333],
       ...,
       [-1.41488359,  0.69459542, -1.08402925, ..., -1.40729178,
        -5.16958445, -2.06114091],
       [-2.5628838 ,  1.10293738, -2.63083881, ..., -0.99083508,
        -7.72526503, -1.04238015],
       [-3.67204472,  0.33042499,  4.1620681 , ..., -4.04393533,
        -9.70169837, -4.83837598]])

In [21]:
savemat('data/blog-trans-fast.mat', {'emb': U_fast})

Run classification:

In [309]:
%%time
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    _macro_f1 = main(
        ["--emb", "data/blog-trans-fast.mat",
          "--network","blogcatalog/blogcatalog.mat",
          "--num-shuffles", "5"],
        )
    print (_macro_f1)

0.1740621209024954
CPU times: user 14.8 s, sys: 21.9 s, total: 36.7 s
Wall time: 9.96 s


## GraRep

A nice answer on truncated SVD in Python: https://stackoverflow.com/questions/31523575/get-u-sigma-v-matrix-from-truncated-svd-in-scikit-learn

In [20]:
from sparsesvd import sparsesvd
from scipy.sparse import csc_matrix

In [34]:
%%time
ut, s, vt = sparsesvd(blog_A, 128)

CPU times: user 3.22 s, sys: 30 ms, total: 3.25 s
Wall time: 3.24 s


For reference, here is the GraRep implementation: https://github.com/ShelsonCao/GraRep/blob/master/code/core/GetProbTranMat.m

In [164]:
def get_svd_emb(A, d):
    ut, s, vt = sparsesvd(A, d)
    return np.matmul(ut.T, np.diag(s ** 0.5) )

def grarep(A, q=3, d=128, beta=None):
    U_list = []
    
    N = A.shape[0]
    if beta is None:
        beta = 1.0 / N
    normalizer = spdiags(np.squeeze(1.0 / scipy.sparse.csc_matrix.sum(blog_A, axis=1) ), 0, N, N)
    trans = normalizer @ A
    trans_pow = trans
    
    for i in range(1, q + 1):
        print ('Current order: ', i)
        _mat = get_grarep_clipped_matrix(trans_pow / trans_pow.sum(axis=0), beta)
        _cur_U = get_svd_emb(_mat, d)
        U_list.append(_cur_U)
        # is this going to cause loss of accuracy?
        trans_pow = trans @ trans_pow
    return U_list

In [150]:
x = np.asarray([[0, 1], [2, 3]])
y = np.log(x)
y[np.isneginf(y)] = 0
y

  


array([[0.        , 0.        ],
       [0.69314718, 1.09861229]])

In [165]:
%time U_list = grarep(blog_A, q=6)

Current order:  1


  This is separate from the ipykernel package so we can avoid doing imports until


Current order:  2
Current order:  3
Current order:  4
Current order:  5
Current order:  6
CPU times: user 7min 45s, sys: 48.7 s, total: 8min 34s
Wall time: 8min 9s


In [166]:
U = np.concatenate(U_list, axis=1)
U.shape

(10312, 768)

In [169]:
savemat('data/blog-grarep-q=6.mat', {'emb': U})

Or alternatively, normalize before concatenation:

In [196]:
U_list_norm = [normalize(_U, norm='l2', axis=1) for _U in U_list]
U_norm = np.concatenate(U_list_norm, axis=1)
U_norm.shape

(10312, 768)

In [202]:
savemat('data/blog-grarep-q=6-norm.mat', {'emb': U_norm})

Then run classification:

In [189]:
%%time
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    _ = main(
        ["--emb", "data/blog-grarep-q=6.mat",
          "--network","blogcatalog/blogcatalog.mat",
          "--num-shuffles", "5"])

Results, using embeddings of dimensionality 768
-------------------
Train percent: 0.1
Shuffle #1:      {'micro': 0.37347736152608596, 'macro': 0.2239223887022614}
Shuffle #2:      {'micro': 0.3574182603331277, 'macro': 0.21202142126653475}
Shuffle #3:      {'micro': 0.3666153846153846, 'macro': 0.21850897139673947}
Shuffle #4:      {'micro': 0.3614652127169405, 'macro': 0.21702878391208905}
Shuffle #5:      {'micro': 0.3688215850564994, 'macro': 0.2154206458435744}
Average score: {'micro': 0.36555956084960767, 'macro': 0.2173804422242398}
-------------------
CPU times: user 30.7 s, sys: 53.7 s, total: 1min 24s
Wall time: 15.3 s


For the original MATLAB GraRep implementation:

In [None]:
cd /home/local_hcchen/ws/GraRep/code/core
matlab
load('/home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.mat')
# should take about 5 minutes
tic; U = GraRep(network, 3, 128, false); toc
save('/home/local_hcchen/ws/large-network-embeddings/blogcatalog/grarep-steps=3-norm=false.mat', 'U')

Normalization!!!

In [181]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    _ = main(
        ["--emb", "/home/local_hcchen/ws/large-network-embeddings/blogcatalog/grarep-steps=3-norm=false.mat",
          "--network","/home/local_hcchen/ws/large-network-embeddings/blogcatalog/blogcatalog.mat",
          "--num-shuffles", "5"])

Results, using embeddings of dimensionality 384
-------------------
Train percent: 0.1
Shuffle #1:      {'micro': 0.36686162844389725, 'macro': 0.2175807549452131}
Shuffle #2:      {'micro': 0.3672969455714613, 'macro': 0.21246681498005252}
Shuffle #3:      {'micro': 0.36046422258089306, 'macro': 0.2191963041617719}
Shuffle #4:      {'micro': 0.36952849024727386, 'macro': 0.21639958480866647}
Shuffle #5:      {'micro': 0.36678944137507674, 'macro': 0.21061840465392065}
Average score: {'micro': 0.3661881456437205, 'macro': 0.21525237270992492}
-------------------


In [182]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    _ = main(
        ["--emb", "/home/local_hcchen/ws/large-network-embeddings/blogcatalog/grarep-steps=3.mat",
          "--network","/home/local_hcchen/ws/large-network-embeddings/blogcatalog/blogcatalog.mat",
          "--num-shuffles", "5"])

Results, using embeddings of dimensionality 384
-------------------
Train percent: 0.1
Shuffle #1:      {'micro': 0.38673312883435584, 'macro': 0.23039458041463504}
Shuffle #2:      {'micro': 0.38405351376287866, 'macro': 0.22862059986057068}
Shuffle #3:      {'micro': 0.37781868384721584, 'macro': 0.22020741182804598}
Shuffle #4:      {'micro': 0.3930960252171907, 'macro': 0.23724224031936236}
Shuffle #5:      {'micro': 0.38105117565698476, 'macro': 0.21206827600585948}
Average score: {'micro': 0.38455050546372516, 'macro': 0.22570662168569472}
-------------------


In [185]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    _ = main(
        ["--emb", "/home/local_hcchen/ws/large-network-embeddings/blogcatalog/grarep-steps=6-norm=false.mat",
          "--network","/home/local_hcchen/ws/large-network-embeddings/blogcatalog/blogcatalog.mat",
          "--num-shuffles", "5"])

Results, using embeddings of dimensionality 768
-------------------
Train percent: 0.1
Shuffle #1:      {'micro': 0.3662305867951955, 'macro': 0.21994083650761226}
Shuffle #2:      {'micro': 0.3685823754789272, 'macro': 0.22101012232390843}
Shuffle #3:      {'micro': 0.3730311179408375, 'macro': 0.22665150849111165}
Shuffle #4:      {'micro': 0.36857866850956, 'macro': 0.22044264381551862}
Shuffle #5:      {'micro': 0.36790936232105953, 'macro': 0.23083254539261516}
Average score: {'micro': 0.3688664222091159, 'macro': 0.2237755313061532}
-------------------


In [186]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    _ = main(
        ["--emb", "/home/local_hcchen/ws/large-network-embeddings/blogcatalog/grarep-steps=6.mat",
          "--network","/home/local_hcchen/ws/large-network-embeddings/blogcatalog/blogcatalog.mat",
          "--num-shuffles", "5"])

Results, using embeddings of dimensionality 768
-------------------
Train percent: 0.1
Shuffle #1:      {'micro': 0.3789691670501611, 'macro': 0.23489610360269006}
Shuffle #2:      {'micro': 0.3867208464310358, 'macro': 0.2315061374628617}
Shuffle #3:      {'micro': 0.3784530386740331, 'macro': 0.23274557954971678}
Shuffle #4:      {'micro': 0.37847809377402, 'macro': 0.2296160308062376}
Shuffle #5:      {'micro': 0.3832321524629217, 'macro': 0.23184508517220584}
Average score: {'micro': 0.38117065967843433, 'macro': 0.2321217873187424}
-------------------


## DeepWalk

And run classification for DeepWalk:

In [70]:
%%time
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    _ = main(
        ["--emb", "~/ws/deepwalk/example_graphs/blogcatalog.embeddings",
          "--network","blogcatalog/blogcatalog.mat",
          "--num-shuffles", "1"])

Results, using embeddings of dimensionality 128
-------------------
Train percent: 0.1
Shuffle #1:      {'micro': 0.34998850486627325, 'macro': 0.20068161161471323}
Average score: {'micro': 0.34998850486627325, 'macro': 0.20068161161471323}
-------------------
CPU times: user 3.15 s, sys: 5.17 s, total: 8.32 s
Wall time: 1.77 s


And for DeepWalk:

In [65]:
y_dw, preds_dw, indices_test_dw = main(
        ["--emb", "~/ws/deepwalk/example_graphs/blogcatalog.embeddings",
          "--network","blogcatalog/blogcatalog.mat",
          "--num-shuffles", "1"])

Results, using embeddings of dimensionality 128
-------------------
Train percent: 0.1
Shuffle #1:      {'micro': 0.357416781714987, 'macro': 0.21307696569035764}
Average score: {'micro': 0.357416781714987, 'macro': 0.21307696569035764}
-------------------


  'precision', 'predicted', average, warn_for)
