# Init

In [11]:
%matplotlib inline

import csv
import datetime
import glob
import itertools
import json
import math
import matplotlib
import time
import logging
import sys
import sqlite3
import os
import random
import warnings

import gensim

import scipy.sparse

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

from sklearn import random_projection
from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict

from sklearn.manifold import TSNE
from scipy.sparse import coo_matrix, csr_matrix, csc_matrix, spdiags
from scipy.io import loadmat, savemat
from sklearn.metrics import f1_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize, MultiLabelBinarizer

In [2]:
# Silence warnings (originally added to hide sklearn's warnings).
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Every later call that goes through ``warnings.warn`` now hits the no-op above.
warnings.warn = warn
# Additionally install an "ignore" filter for RuntimeWarning.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
from scoring import main

In [4]:
%load_ext autoreload
%autoreload 2

# Load Data

## Blogcatalog

In [5]:
# Read the BlogCatalog MATLAB file from a path relative to the notebook.
blogcatalog = loadmat('blogcatalog/blogcatalog.mat')
# 'group' entry of the file -- presumably the node-label matrix; verify against the dataset.
blog_labels = blogcatalog['group']
# 'network' entry of the file; passed below as the matrix A whose rows are normalized.
blog_A = blogcatalog['network']
# Number of rows of the network matrix; reused as the side length of the N x N matrices below.
N = blog_A.shape[0]

# Utility Functions

## Random Walks

We first slightly modify DeepWalk so that it only generates the random walks and does not train the word2vec model. Run the following commands to generate the count files:

In [None]:
deepwalk --format mat --input example_graphs/blogcatalog.mat --max-memory-data-size 0 --number-walks 80 --representation-size 128 --walk-length 40 --window-size 10 --workers 20 --output example_graphs/blogcatalog.embeddings --only-walks

In [None]:
time deepwalk --format mat --input example_graphs/blogcatalog.mat --max-memory-data-size 0 \
--number-walks 1000 --representation-size 128 --walk-length 40 --window-size 7 \
--workers 10 --output example_graphs/blogcatalog.emb --only-walks

Then we load the counts and aggregate them into the overall (normalized) transition probability matrices:

In [72]:
# sample file name: blogcatalog.embeddings.walks.4.order-5.npz
def get_approx_matrices(path, max_order, N):
    """Aggregate per-order count files into column-normalized dense matrices.

    For every order ``k`` in ``1..max_order`` all files matching
    ``path + '*order-<k>.npz'`` are loaded, their entries are summed into one
    ``N x N`` count matrix, every column is divided by its sum, and the result
    is scaled by 0.5.

    Parameters
    ----------
    path : str
        Filename prefix of the count files (the glob pattern is appended to it).
    max_order : int
        Highest order to load; orders start at 1.
    N : int
        Side length of the square count matrices.

    Returns
    -------
    list
        ``max_order`` dense ``N x N`` matrices, one per order. Columns whose
        counts sum to zero (including the case where no file matched) are left
        as all zeros instead of becoming NaN.

    Notes
    -----
    NOTE(review): the reason for the constant 0.5 scaling is not visible in
    this notebook -- confirm it is still wanted.
    """
    approx_M_list = []
    for order in range(1, max_order + 1):
        print ("Processing order %d." % order)
        row_chunks, col_chunks, val_chunks = [], [], []

        # sorted() only makes the log order deterministic; the sum is order-independent.
        fname_list = sorted(glob.glob(path + '*order-%d.npz' % order))
        if not fname_list:
            print ("No count files found for order %d." % order)
        for fname in fname_list:
            print ("Processing count file %s." % fname)
            # tocoo() lets us read row/col/data regardless of the stored sparse format.
            cx = scipy.sparse.load_npz(fname).tocoo()
            row_chunks.append(cx.row)
            col_chunks.append(cx.col)
            val_chunks.append(cx.data)

        if val_chunks:
            rows = np.concatenate(row_chunks)
            cols = np.concatenate(col_chunks)
            vals = np.concatenate(val_chunks)
        else:
            rows = np.empty(0, dtype=np.int64)
            cols = np.empty(0, dtype=np.int64)
            vals = np.empty(0, dtype=np.float64)

        # Duplicate (row, col) pairs coming from different files are summed on densification.
        M = scipy.sparse.coo_matrix((vals, (rows, cols)), shape=(N, N))
        dense_counts = M.todense()

        col_sums = np.asarray(dense_counts.sum(axis=0))
        # An empty column would give 0/0 = NaN; dividing by 1 keeps it at zero.
        col_sums[col_sums == 0] = 1
        approx_M_list.append(0.5 * (dense_counts / col_sums))

    return approx_M_list

In [74]:
%time approx_M_list_large = \
get_approx_matrices('/home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.emb.walks.nwalks', 3, N)

Processing order 1.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.emb.walks.nwalks-1000-0.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.emb.walks.nwalks-1000-6.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.emb.walks.nwalks-1000-4.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.emb.walks.nwalks-1000-2.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.emb.walks.nwalks-1000-8.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.emb.walks.nwalks-1000-7.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.emb.walks.nwalks-1000-9.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.emb.walks.nwalks-1000-3.order-1.npz.
Processing count file /home/local_hcchen/ws/

In [65]:
%time approx_M_list = get_approx_matrices('/home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog', 3, N)

Processing order 1.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.embeddings.walks.1.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.embeddings.walks.12.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.embeddings.walks.11.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.embeddings.walks.8.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.embeddings.walks.2.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.embeddings.walks.7.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.embeddings.walks.9.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.embeddings.walks.4.order-1.npz.
Processing count file /home/local_hcchen/ws/deepwalk/example_graphs/blogcatalog.em

In [75]:
approx_M_list[2]

matrix([[0.00000000e+00, 0.00000000e+00, 1.55908949e-04, ...,
         1.17905990e-04, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [9.62463908e-05, 0.00000000e+00, 0.00000000e+00, ...,
         7.86039931e-05, 0.00000000e+00, 0.00000000e+00],
        ...,
        [1.44369586e-04, 0.00000000e+00, 1.55908949e-04, ...,
         0.00000000e+00, 3.84911470e-04, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         3.93019965e-05, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [76]:
approx_M_list_large[2]

matrix([[6.72133351e-05, 1.34807226e-04, 9.34276745e-05, ...,
         7.89959301e-05, 3.13636934e-05, 0.00000000e+00],
        [1.12022225e-05, 0.00000000e+00, 6.22851163e-06, ...,
         6.31967441e-06, 0.00000000e+00, 0.00000000e+00],
        [5.60111126e-05, 4.49357419e-05, 2.49140465e-05, ...,
         9.47951162e-05, 6.27273868e-05, 0.00000000e+00],
        ...,
        [9.33518543e-05, 8.98714838e-05, 1.86855349e-04, ...,
         1.39032837e-04, 1.56818467e-04, 3.64099763e-04],
        [3.73407417e-06, 0.00000000e+00, 1.24570233e-05, ...,
         1.57991860e-05, 0.00000000e+00, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         1.26393488e-05, 0.00000000e+00, 7.28199527e-04]])

In [77]:
exact_M_list[2]

matrix([[1.02250228e-04, 1.05998914e-04, 9.47405841e-05, ...,
         9.61452923e-05, 1.03776074e-04, 1.14272552e-04],
        [8.84432638e-05, 5.11748722e-05, 9.52171589e-05, ...,
         9.19331008e-05, 3.23180156e-05, 5.70000221e-05],
        [9.32400698e-05, 1.12309994e-04, 1.10652255e-04, ...,
         8.31216949e-05, 5.47075911e-05, 3.51553259e-05],
        ...,
        [1.24371428e-04, 1.42528317e-04, 1.09254782e-04, ...,
         1.66773730e-04, 1.83882044e-04, 4.29384108e-04],
        [1.21168684e-04, 4.52245750e-05, 6.49044113e-05, ...,
         1.65973931e-04, 5.09303610e-04, 3.23213705e-04],
        [1.59022097e-04, 9.50664388e-05, 4.97095860e-05, ...,
         4.61922289e-04, 3.85222958e-04, 1.88236909e-02]])

# Exact Computation of $P^k$

In [6]:
def get_debug_clipped_matrices(A, q=3):
    """Compute column-normalized powers of the row-normalized matrix of ``A``.

    ``A`` is row-normalized into ``trans`` (each row divided by its sum). For
    every order ``k`` in ``1..q`` the power ``trans^k`` is densified and each
    of its columns is divided by the column sum.

    Despite the name, no clipping is applied here.

    Parameters
    ----------
    A : scipy sparse matrix
        Square matrix whose rows are normalized (the notebook passes the
        BlogCatalog network matrix).
    q : int, optional
        Highest power to compute.

    Returns
    -------
    list
        ``q`` dense ``N x N`` matrices, one per order. Rows of ``A`` that sum
        to zero stay zero in ``trans``, and columns of a power that sum to zero
        are left as zeros instead of becoming NaN.
    """
    M_list = []
    N = A.shape[0]

    # Invert the row sums, leaving 0 for empty rows instead of producing inf.
    row_sums = np.asarray(A.sum(axis=1), dtype=float).ravel()
    inv_row_sums = np.zeros_like(row_sums)
    non_empty = row_sums != 0
    inv_row_sums[non_empty] = 1.0 / row_sums[non_empty]

    normalizer = spdiags(inv_row_sums, 0, N, N)
    trans = normalizer @ A
    trans_pow = trans

    for i in range(1, q + 1):
        print ('Current order: ', i)
        # Densify explicitly so the result type does not depend on how the
        # installed SciPy implements sparse / dense division.
        dense_pow = trans_pow.todense()
        col_sums = np.asarray(dense_pow.sum(axis=0))
        # Guard against 0/0 for columns that receive no probability mass.
        col_sums[col_sums == 0] = 1.0
        M_list.append(dense_pow / col_sums)
        # NOTE(review): repeated sparse products may accumulate floating-point
        # error (question raised by the original author); not assessed here.
        if i < q:
            # Skip the product after the last order -- its result was never used.
            trans_pow = trans @ trans_pow

    return M_list

In [7]:
%time exact_M_list = get_debug_clipped_matrices(blog_A, q=6)

Current order:  1
Current order:  2
Current order:  3
Current order:  4
Current order:  5
Current order:  6
CPU times: user 1min 48s, sys: 8.94 s, total: 1min 57s
Wall time: 1min 57s


# Random Projection

In [39]:
max_order = 3

In [41]:
# takes in pre-computed clipped transition matrices
def randne_projection_bruteforce(M_list, dim=128, projection_method='gaussian'):
    """Project every matrix in ``M_list`` down to ``dim`` columns.

    Parameters
    ----------
    M_list : iterable
        Pre-computed matrices; each one is passed to ``fit_transform``.
    dim : int, optional
        ``n_components`` of the random projection.
    projection_method : {'gaussian', 'sparse'}, optional
        Selects ``GaussianRandomProjection`` or ``SparseRandomProjection``.
        Any other value fails the assertion.

    Returns
    -------
    list
        One projected matrix per input, in input order.
    """
    assert projection_method in ('gaussian', 'sparse')

    # A single transformer (fixed random_state=42) is re-fitted on each matrix.
    if projection_method == 'sparse':
        projector = random_projection.SparseRandomProjection(n_components=dim, random_state=42)
    else:
        projector = random_projection.GaussianRandomProjection(n_components=dim, random_state=42)

    return [projector.fit_transform(matrix) for matrix in M_list]

In [42]:
# When weights is None, concatenate the embeddings from different powers of A instead of linearly combining them
def randne_merge(U_list, weights, normalization=False):
    """Merge per-order embeddings by concatenation or weighted sum.

    Parameters
    ----------
    U_list : sequence
        Embedding matrices with the same number of rows. Entries may be dense
        arrays/matrices or scipy sparse matrices of any format; every entry is
        converted to a dense ``ndarray`` before merging.
    weights : sequence of float or None
        ``None`` concatenates the embeddings along axis 1. Otherwise the
        embeddings are summed after scaling each by its weight; pairing uses
        ``zip``, so surplus embeddings or surplus weights are ignored.
    normalization : bool, optional
        When true, each embedding is row-wise L2-normalized before merging.

    Returns
    -------
    numpy.ndarray
        The merged embedding.

    Raises
    ------
    ValueError
        If ``U_list`` is empty.
    """
    if len(U_list) == 0:
        raise ValueError('U_list must contain at least one embedding matrix')

    # Densify every sparse format (random projections of sparse input are not
    # necessarily csc), and turn np.matrix into ndarray so the accumulation
    # below and normalize() always see plain arrays.
    dense_U_list = [
        _U.toarray() if scipy.sparse.issparse(_U) else np.asarray(_U)
        for _U in U_list
    ]
    _U_list = [normalize(_U, norm='l2', axis=1) for _U in dense_U_list] if normalization else dense_U_list

    if weights is None:
        return np.concatenate(_U_list, axis=1)
    U = np.zeros_like(_U_list[0])
    for cur_U, weight in zip(_U_list, weights):
        U += cur_U * weight
    return U

In [48]:
def get_emb_filename(prefix, conf):
    """Build the ``.mat`` filename that encodes an embedding configuration.

    Parameters
    ----------
    prefix : str
        Leading part of the filename (may include a directory).
    conf : dict
        Must contain ``dim``, ``projection_method``, ``input_matrix``,
        ``normalization`` and ``weights``; ``beta`` is optional and rendered
        as an empty string when absent. ``weights`` may be ``None``.

    Returns
    -------
    str
        ``prefix`` followed by the ``key=value`` pairs and the ``.mat`` suffix.
    """
    pieces = [
        prefix,
        '-dim=', str(conf['dim']),
        ',projection_method=', conf['projection_method'],
        ',input_matrix=', conf['input_matrix'],
        ',normalization=', str(conf['normalization']),
        ',weights=', 'None' if conf['weights'] is None else ','.join(map(str, conf['weights'])),
        ',beta=', str(conf['beta']) if 'beta' in conf else '',
        '.mat',
    ]
    return ''.join(pieces)

In [57]:
def get_grarep_clipped_matrix(A, beta):
    """Return ``max(log(A) - log(beta), 0)`` element-wise as a CSC matrix.

    The logarithm is evaluated only on strictly positive entries, so zeros do
    not trigger divide warnings and no full dense temporary of logs is built.

    Parameters
    ----------
    A : 2-D array-like or scipy sparse matrix
        Input matrix. Entries that are zero, negative or NaN map to zero in
        the result.
    beta : float
        Shift: ``log(beta)`` is subtracted from ``log(A)``. When
        ``abs(beta) <= 1e-9`` no shift is applied. ``beta`` is expected to be
        positive.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix with the shape of ``A`` holding the positive part of the
        shifted logarithm.
    """
    eps = 1e-9
    log_beta = np.log(beta) if abs(beta) > eps else 0.0

    if scipy.sparse.issparse(A):
        entries = A.tocoo(copy=True)
        # Merge repeated coordinates so each position is transformed once.
        entries.sum_duplicates()
        shape = entries.shape
        rows, cols, vals = entries.row, entries.col, entries.data
    else:
        dense = np.asarray(A)
        shape = dense.shape
        rows, cols = np.nonzero(dense > 0)
        vals = dense[rows, cols]

    positive = vals > 0
    rows, cols, vals = rows[positive], cols[positive], vals[positive]

    shifted = np.log(vals) - log_beta
    # Clipping at zero: only strictly positive results need to be stored.
    keep = shifted > 0
    return csc_matrix((shifted[keep], (rows[keep], cols[keep])), shape=shape)

In [49]:
def get_classification_score(M_list, conf):
    """Project, merge, save and score an embedding built from ``M_list``.

    Parameters
    ----------
    M_list : iterable
        Matrices handed to ``randne_projection_bruteforce``.
    conf : dict
        Supplies ``dim``, ``projection_method``, ``weights`` and
        ``normalization`` (plus the keys ``get_emb_filename`` needs).

    Returns
    -------
    The value returned by ``scoring.main``; in this notebook's recorded
    outputs it is a mapping with ``'micro'`` and ``'macro'`` entries.

    Side effects
    ------------
    Writes the merged embedding under the key ``'emb'`` to a ``.mat`` file
    whose name starts with ``data/blog-test-``.
    """
    out_path = get_emb_filename('data/blog-test-', conf)

    projections = randne_projection_bruteforce(
        M_list,
        dim=conf['dim'],
        projection_method=conf['projection_method'],
    )
    embedding = randne_merge(projections, conf['weights'], conf['normalization'])
    savemat(out_path, {'emb': embedding})

    scoring_args = [
        "--emb", out_path,
        "--network", "blogcatalog/blogcatalog.mat",
        "--num-shuffles", "5",
    ]
    return main(scoring_args)

# Classification Performance

In [58]:
# Shift passed to get_grarep_clipped_matrix, which subtracts log(beta) from log(M) before clipping at zero.
beta = 1.0 / N
# Apply that log / shift / clip transform to every exact matrix.
exact_clipped_M_list = [get_grarep_clipped_matrix(M, beta) for M in exact_M_list]

In [83]:
beta

9.69743987587277e-05

In [78]:
approx_clipped_M_list_large = [get_grarep_clipped_matrix(M, beta) for M in approx_M_list_large]

In [68]:
# Configuration consumed by get_classification_score / get_emb_filename.
# There is no 'beta' key, so get_emb_filename renders an empty 'beta=' field.
conf = {
    # 'gaussian' or 'sparse'; selects the random projection class.
    'projection_method': 'sparse',
    # In the functions shown here this is only used as a label in the filename.
    'input_matrix': 'grarep_trans',
    # Weights paired with the projected matrices by randne_merge (zip: extras are ignored).
    'weights': [1.0, 1.0, 1.0],
    # Row-wise L2-normalize each projected matrix before merging.
    'normalization': True,
    # Number of projection components.
    'dim': 512,
}

In [70]:
get_classification_score(exact_M_list[:3], conf)

defaultdict(float,
            {'micro': 0.32731473252752374, 'macro': 0.18046044711262538})

In [71]:
%time get_classification_score(exact_clipped_M_list[:3], conf)

CPU times: user 24.5 s, sys: 46.8 s, total: 1min 11s
Wall time: 28.6 s


defaultdict(float, {'micro': 0.361645079617954, 'macro': 0.20184524336535392})

In [59]:
%time get_classification_score(exact_clipped_M_list, conf)

CPU times: user 27.4 s, sys: 44 s, total: 1min 11s
Wall time: 16.5 s


defaultdict(float, {'micro': 0.3667918437850054, 'macro': 0.2250157153432001})

In [52]:
f1_scores_1 = get_classification_score(approx_M_list, conf)
f1_scores_1

defaultdict(float,
            {'micro': 0.22919203004951366, 'macro': 0.10683710371752282})

In [60]:
%time get_classification_score(approx_clipped_M_list, conf)

CPU times: user 22.3 s, sys: 45.4 s, total: 1min 7s
Wall time: 11.5 s


defaultdict(float, {'micro': 0.2431057314986264, 'macro': 0.11073911991539576})

In [81]:
%time get_classification_score(approx_M_list_large, conf)

CPU times: user 25.4 s, sys: 45.5 s, total: 1min 10s
Wall time: 15.5 s


defaultdict(float, {'micro': 0.311276348028716, 'macro': 0.1621858568652469})

In [82]:
%time get_classification_score(approx_clipped_M_list_large, conf)

CPU times: user 20.5 s, sys: 44.9 s, total: 1min 5s
Wall time: 9.75 s


defaultdict(float, {'micro': 0.30332423856713153, 'macro': 0.1582202812481839})

In [None]:
INF = 1000

prefix = 'data/blog'

# NOTE(review): `all_conf` and `randne_wrapper` are not defined in the visible
# part of this notebook; this cell fails on a fresh kernel unless they are
# defined elsewhere -- confirm.
computed_U_list = {}
score_records = []
df = pd.DataFrame()

for conf in all_conf[246:]:
    emb_filename = get_emb_filename(prefix, conf)
    print (emb_filename)
    # Only compute and save the embedding when its file does not exist yet.
    path = Path(emb_filename)
    if not path.is_file():
        U = randne_wrapper(blog_A, conf, computed_U_list)
        savemat(emb_filename, {'emb': U})
    else:
        print ('File %s already exists, skipped.' % emb_filename)
    f1_scores = main(
        ["--emb", emb_filename,
          "--network","blogcatalog/blogcatalog.mat",
          "--num-shuffles", "5"],
        )
    # One record per configuration: the configuration merged with its scores.
    score_records.append({**conf, **f1_scores})
    # DataFrame.append was removed in pandas 2.0, so rebuild the frame from the
    # accumulated records. The CSV is rewritten every iteration so partial
    # results survive an interrupted run.
    df = pd.DataFrame(score_records)
    df.to_csv('data/blog-scores.txt', sep='\t', index=False, header=True)

df