# Init

In [None]:
%matplotlib inline

import csv
import datetime
import json
import matplotlib
import time
import logging
import sys
import sqlite3
import os
import random

import gensim

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm_notebook as tqdm
from collections import Counter, defaultdict

from sklearn.manifold import TSNE
from scipy.sparse import coo_matrix, csr_matrix
from scipy.io import loadmat, savemat
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

In [None]:
from jupyter_client import find_connection_file

In [None]:
find_connection_file()

In [None]:
matplotlib.style.use('ggplot')
sns.set(color_codes=True)

# Read Data

In [None]:
ls /mnt/store1/hcchen/www-network

In [None]:
data_dir = '/mnt/store1/hcchen/www-network/'
rank_fname = 'ranks.txt'
edges_fname = 'edges.txt'
vertices_fname = 'vertices.txt'

Get node to ID mapping:

In [None]:
%%time
# Build the lookup from node name (second column of the tab-separated
# vertices file) to node ID (first column).
with open(os.path.join(data_dir, vertices_fname), 'r') as vertices_file:
    node_to_id = {
        fields[1]: fields[0]
        for fields in csv.reader(vertices_file, delimiter='\t')
    }

Filter based on harmonic centrality:

In [None]:
K = 200000

In [None]:
%%time
# Collect the node IDs of the hosts named in the first K data rows of the
# rank file (the '#host_rev' column holds the node name).
with open(os.path.join(data_dir, rank_fname)) as rank_file:
    ranked_hosts = csv.DictReader(rank_file, delimiter='\t')
    # Pairing with range(K) stops the scan after at most K rows.
    node_set = {
        node_to_id[entry['#host_rev']]
        for _, entry in zip(range(K), ranked_hosts)
    }

This mapping is huge, so from here on we keep only the relevant part (i.e., the entries for the top-K nodes).

In [None]:
%time node_to_id = {node: id for node, id in node_to_id.items() if id in node_set}

In [None]:
%time id_to_node = {id: node for node, id in node_to_id.items()}

The goal is to choose the subgraph with only the top-K most important nodes:

In [None]:
# Print the first six raw lines of the rank file to inspect its layout.
with open(os.path.join(data_dir, rank_fname), 'r') as rank_file:
    for _, raw_line in zip(range(6), rank_file):
        print(raw_line, end="")

Construct the subgraph consisting of top-K nodes: (this is slow, taking ~30min)

In [None]:
# Build the undirected subgraph induced by the top-K nodes: stream the
# tab-separated edge list and keep an edge only when both endpoints are
# in node_set.
G = nx.Graph()

with open(os.path.join(data_dir, edges_fname)) as fp:
    # `total` is a hard-coded line count; it only sizes the progress bar.
    for line in tqdm(fp, total=2504610000):
        row = line.strip().split('\t')
        # Skip blank/short lines instead of failing on row[1].
        if len(row) < 2:
            continue
        if row[0] in node_set and row[1] in node_set:
            G.add_edge(row[0], row[1])

In [None]:
G.number_of_nodes()

In [None]:
G.number_of_edges()

In [None]:
# Pass the path instead of an open handle: the handle created inline was
# never closed, so buffered data could be left unflushed. Given a path,
# networkx opens and closes the file itself.
nx.write_adjlist(G, 'network-200k.adjlist')

In [None]:
# Pass the path instead of an open handle: the handle created inline was
# never closed, so buffered data could be left unflushed. Given a path,
# networkx opens and closes the file itself.
# NOTE(review): this writes whatever G currently holds; presumably G was
# rebuilt with K = 10000 before this cell was run — confirm.
nx.write_adjlist(G, 'network-10k.adjlist')

**Note: we need additional post-processing to remove some unnecessary content in the adj list file!!**

But it seems that even without post-processing our algorithm is working well. Weird :-)

# DeepWalk

We run the following command in terminal:

In [None]:
deepwalk --format adjlist --input network-10k.adjlist \
--max-memory-data-size 0 --number-walks 80 --representation-size 128 --walk-length 40 --window-size 10 \
--workers 38 --output www-network/10k.embeddings

# KNN of Representative websites

What are representative websites? We define a list of them here:

In [None]:
representative_sites = ['com.facebook',
                        'com.twitter',
                        'com.google',
                        'com.linkedin',
                        'com.instagram',
                        'com.amazon',
                        'com.google.mail',
                        'com.nytimes',
                        'org.acm',
                        'gov.treasury',
                        'com.baidu',
                        'com.dell',
                        'com.github'
                       ]

## RandNE

Read the 200K graph from file:

In [None]:
G = nx.read_adjlist('/mnt/store1/hcchen/ws/large-network-embeddings/www-network/network-200k.adjlist')

First, we need to map the nodes to range of 0 -> N - 1:

In [None]:
# Number the graph's nodes 0..N-1 in iteration order and keep the mapping
# in both directions (matrix index <-> node).
matid_to_node = dict(enumerate(G.nodes()))
node_to_matid = {node: matid for matid, node in matid_to_node.items()}

Construct a sparse matrix from the adj list:

In [None]:
%%time
# Flatten the adjacency structure into parallel row/column index lists:
# one (row, col) entry per (node, neighbor) pair reported by the graph.
rows, cols = [], []

for src in G.nodes():
    src_index = node_to_matid[src]
    neighbor_indices = [node_to_matid[dst] for dst in G[src]]
    rows.extend([src_index] * len(neighbor_indices))
    cols.extend(neighbor_indices)

In [None]:
# Sparse N x N adjacency matrix with a 1.0 at every (row, col) pair.
num_nodes = G.number_of_nodes()
m = csr_matrix((np.ones(len(rows)), (rows, cols)), shape=(num_nodes, num_nodes))

In [None]:
savemat('network-200k.mat', {'A': m})

Then we run RandNE in both MATLAB and Python (my own implementation!) to get the embedding vectors. We load the embeddings here; note that there are two versions of the embeddings, one built from the adjacency matrix and the other one built from the transition matrix:

In [None]:
ls /home/hcchen/ws/large-network-embeddings/www-network

To run my RandNE implementation on trans matrix: ``python3 src/randne.py --matfile-variable-name A --input /home/hcchen/ws/large-network-embeddings/www-network/network-200k.mat --output /home/hcchen/ws/large-network-embeddings/www-network/randne-py-trans-www-200k.mat --use-trans-matrix -q 3 -d 128 --weights 1 100 1000``

To run my RandNE implementation on adj matrix: ``python3 src/randne.py --matfile-variable-name A --input /home/hcchen/ws/large-network-embeddings/www-network/network-200k.mat --output /home/hcchen/ws/large-network-embeddings/www-network/randne-py-adj-www-200k.mat -q 2 -d 128 --weights 1 0.01``

In [None]:
emb_randne_py_trans_200k = loadmat('www-network/randne-py-trans-www-200k.mat')['emb']

In [None]:
emb_randne_py_adj_200k = loadmat('www-network/randne-py-adj-www-200k.mat')['emb']

In [None]:
emb_randne_trans_200k = loadmat('www-network/randne-trans-www-200k.mat')['U']

In [None]:
emb_randne_adj_200k = loadmat('www-network/randne-adj-www-200k.mat')['U_adj']

Utility function for querying the most similar sites:

In [None]:
def most_similar_sites_randne(emb, site, k=10):
    """Return the names of the ``k`` sites closest to ``site`` in ``emb``.

    Closeness is the cosine similarity between the query's row of ``emb``
    and every row of ``emb``.

    Args:
        emb: Embedding matrix whose rows are indexed by matrix ID.
        site: Node name to query, as keyed in ``node_to_id``
            (e.g. ``'com.facebook'``).
        k: Number of neighbors to return.

    Returns:
        A list of up to ``k`` node names, most similar first. The query
        site itself is never part of the result.

    Raises:
        KeyError: If ``site`` is not in ``node_to_id`` or its ID is not in
            ``node_to_matid``.
    """
    site_id = node_to_matid[node_to_id[site]]
    all_sim = cosine_similarity(np.expand_dims(emb[site_id], axis=0), emb)[0]
    # Stable descending order: ties stay in ascending matrix-ID order.
    ranked = np.argsort(-all_sim, kind='stable')
    # Remove the query by ID rather than assuming it is ranked first; with
    # duplicate or all-zero embeddings another row can take the top slot.
    ranked = ranked[ranked != site_id][:k]

    return [id_to_node[matid_to_node[int(index)]] for index in ranked]

The distribution of cosine similarity is.. a bit weird?

In [None]:
# Histogram (100 bins) of the first row of `randne_sim0`.
# NOTE(review): `randne_sim0` is not assigned in any cell visible in this
# notebook; presumably it held a cosine-similarity matrix like the one
# computed inside most_similar_sites_randne — confirm and define it before
# running this cell.
_ = plt.hist(randne_sim0[0], bins=100)

## DeepWalk

Load the DeepWalk embeddings we got:

In [None]:
emb_200k_dw = gensim.models.KeyedVectors.load_word2vec_format('www-network/200k.embeddings', binary=False)

For classification, we need to index the nodes from 0 to N - 1 consecutively:

In [None]:
node_to_matid['283881577']

In [None]:
matid_to_node[1012]

In [None]:
emb_200k_dw_reindexed = np.asarray([emb_200k_dw.get_vector(matid_to_node[matid]) for matid in range(G.number_of_nodes())])

Save the re-indexed embeddings to a file:

In [None]:
savemat('www-network/deepwalk-www-200k.mat', {'emb': emb_200k_dw_reindexed})

In [None]:
def most_similar_sites(emb, site, k=10):
    """Return the names of the ``k`` sites closest to ``site`` in ``emb``.

    Args:
        emb: Keyed-vector model exposing ``most_similar``, keyed by node ID.
        site: Node name to query, as keyed in ``node_to_id``
            (e.g. ``'com.facebook'``).
        k: Number of neighbors to return; mirrors the ``k`` parameter of
            ``most_similar_sites_randne``.

    Returns:
        A list of node names, most similar first.

    Raises:
        KeyError: If ``site`` is not in ``node_to_id``.
    """
    site_id = node_to_id[site]
    # most_similar yields (node ID, similarity) pairs; only the IDs are used.
    neighbors = emb.most_similar(site_id, topn=k)
    return [id_to_node[neighbor_id] for neighbor_id, _ in neighbors]

In [None]:
for site in representative_sites:
    print ('Site chosen:', site)
    print ('Nearest neighbors: ')
    print (most_similar_sites(emb_200k_dw, site), '\n\n')

# KNN: RandNE vs DeepWalk

Useful resource for finding top websites based on category: https://www.similarweb.com/top-websites/

In [None]:
nuanced_sites = [
#     'com.archdaily',
#     'com.premierleague',
#     'com.nba',
#     'com.audible',
#     'com.mountainproject',
#     'org.wikipedia',
#     'org.archive',
#     'org.sigir',
#     'com.delta',
#     'com.ford',
#     'com.citi',
#     'com.weather',
#     'com.imdb',
#     'com.chase'
    'edu.stonybrook',
    'com.baidu',
    'com.chase',
    'org.wikipedia'
]

In [None]:
def reverse_site(x):
    """Reverse the dot-separated components of ``x``.

    Example: ``'com.google.mail'`` becomes ``'mail.google.com'``.
    """
    return '.'.join(reversed(x.split('.')))

In [None]:
for site in nuanced_sites:
    print ('Site chosen:', site)
    print ('Nearest neighbors from RandNE Adj: ')
    print (list(map(reverse_site, most_similar_sites_randne(emb_randne_py_adj_200k, site))), '\n')

    print ('Nearest neighbors from RandNE Trans: ')
    print (list(map(reverse_site, most_similar_sites_randne(emb_randne_py_trans_200k, site))) )
    
    print ('\n**************************\n')

In [None]:
for site in nuanced_sites:
    print ('Site chosen:', site)
    print ('Nearest neighbors from DeepWalk: ')
    print (list(map(reverse_site, most_similar_sites(emb_200k_dw, site))), '\n')
    
    print ('Nearest neighbors from RandNE Adj: ')
    print (list(map(reverse_site, most_similar_sites_randne(emb_randne_adj_200k, site))), '\n')

#     print ('Nearest neighbors from RandNE Trans: ')
#     print (list(map(reverse_site, most_similar_sites_randne(emb_randne_trans_200k, site))) )
    
    print ('\n**************************\n')

In [None]:
for site in nuanced_sites:
    print ('Site chosen:', site)
    print ('Nearest neighbors from DeepWalk: ')
    print (list(map(reverse_site, most_similar_sites(emb_200k_dw, site))), '\n')
    
    print ('Nearest neighbors from RandNE Adj: ')
    print (list(map(reverse_site, most_similar_sites_randne(emb_randne_adj_200k, site))), '\n')

#     print ('Nearest neighbors from RandNE Trans: ')
#     print (list(map(reverse_site, most_similar_sites_randne(emb_randne_trans_200k, site))) )
    
    print ('\n**************************\n')

# Domain Classification

## Label Distribution

Here, we take the domain extensions as labels of nodes:

In [None]:
K_label = 10

In [None]:
node_to_label = {node: id_to_node[node].split('.')[0] for node in node_set}

In [None]:
label_counts = Counter(node_to_label.values())

Distribution of (the most common) labels among the top-K websites:

In [None]:
# The K_label most frequent labels as (label, count) pairs, highest count first.
top_label_counts = label_counts.most_common(K_label)

In [None]:
top_label_counts

In [None]:
top_labels, top_counts = zip(*top_label_counts)

In [None]:
plt.figure(figsize=(9, 6))
df = pd.DataFrame.from_dict({'label': top_labels, 'count': top_counts})
df = df.set_index('label')
_ = df.plot(kind='bar')

We also want these labels to have consecutive integer IDs:

In [None]:
top_labels

In [None]:
# Give each top label a consecutive integer ID following its position in top_labels.
label_to_id = {label: index for index, label in enumerate(top_labels)}
label_to_id

In [None]:
# Display form of each label with a leading dot, e.g. 'com' -> '.com'.
top_labels_with_dot = ['.' + label for label in top_labels]

## Prepare Data for Classification

~~For classification, let's use these top 20 labels, and ignore the other examples. We also ensure the number of examples from each class is the same:~~

Randomly sample 1,000 nodes from each class, and ignore the websites which do not belong to the top K classes:

In [None]:
N = 200000

In [None]:
# Draw K_sample matrix IDs uniformly at random from each of the top K_label
# classes; nodes whose label is outside label_to_id are ignored.
K_sample = 1000
sampled_nodes = [[] for _ in range(K_label)]
# Walk the matrix IDs that actually exist rather than a hard-coded node
# count: matid_to_node only has an entry per node present in G, so a fixed
# upper bound raises KeyError whenever the graph has fewer nodes.
for matid in range(len(matid_to_node)):
    node_label = node_to_label[matid_to_node[matid]]
    if node_label in label_to_id:
        sampled_nodes[label_to_id[node_label]].append(matid)

# Flatten class by class. random.sample raises ValueError if a class holds
# fewer than K_sample nodes, which keeps the classes balanced.
sampled_nodes = [sample for elements in sampled_nodes for sample in random.sample(elements, K_sample)]

Save the adjacency matrix + label matrix to a single .mat file (we are re-using the sparse matrix from the RandNE chapter):

In [None]:
# One-hot label matrix: row i has a single 1.0 in the column given by the
# label ID of the i-th sampled node.
cols = [label_to_id[node_to_label[matid_to_node[matid]]] for matid in sampled_nodes]
N = len(cols)
rows = list(range(N))
label_mat = csr_matrix((np.ones(N), (rows, cols)), shape=(N, K_label))

In [None]:
label_mat

Sanity check:

In [None]:
np.sum(label_mat, axis=0)

In [None]:
savemat('www-network/www-200k-classification.mat', {'group': label_mat})

And we need to pick embeddings just for these sampled nodes.

For DeepWalk:

In [None]:
emb_200k_dw_classification = \
    np.asarray([emb_200k_dw.get_vector(matid_to_node[matid]) for matid in sampled_nodes])

In [None]:
savemat('www-network/deepwalk-www-200k-classification.mat', {'emb': emb_200k_dw_classification})

For RandNE MATLAB Adj:

In [None]:
emb_randne_adj_200k_classification = emb_randne_adj_200k[sampled_nodes]

In [None]:
savemat('www-network/randne-adj-www-200k-classification.mat', {'emb': emb_randne_adj_200k_classification})

For RandNE Python Adj:

In [None]:
emb_randne_py_adj_200k_classification = emb_randne_py_adj_200k[sampled_nodes]

In [None]:
savemat('www-network/randne-py-adj-www-200k-classification.mat', {'emb': emb_randne_py_adj_200k_classification})

For RandNE Python Trans:

In [None]:
emb_randne_py_trans_200k_classification = emb_randne_py_trans_200k[sampled_nodes]

In [None]:
savemat('www-network/randne-py-trans-www-200k-classification.mat', {'emb': emb_randne_py_trans_200k_classification})

# FastRP

In [None]:
import optuna
prefix = 'result/www'

In [None]:
%%time
order_range = 2
def objective(trial):
    """Optuna objective: evaluate one FastRP configuration.

    Samples ``order_range`` weights log-uniformly from [1, 64] and ``alpha``
    uniformly from [-1, 1], computes the embedding for that configuration
    (or reuses the file if it was already written), and scores it with
    ``scoring``.

    Args:
        trial: The Optuna trial used to draw hyperparameters.

    Returns:
        The negated micro-F1 of the first score entry, so that minimizing
        this value maximizes micro-F1.
    """
    # Local import: `Path` is used below but is not imported by the
    # notebook's import cell.
    from pathlib import Path

    # Invoke suggest methods of a Trial object to generate hyperparameters.
    weights = [trial.suggest_loguniform('weight' + str(order), 1.0, 64.0) for order in range(order_range)]
    alpha = trial.suggest_uniform('alpha', -1.0, 1.0)
    conf = {
        'projection_method': 'sparse',
        'input_matrix': 'trans',
        # Two fixed leading weights followed by the sampled ones.
        'weights': [1.0, 1.0] + weights,
        'normalization': True,
        'dim': 128,
        'alpha': alpha,
        'C': 0.1
    }
    emb_filename = get_emb_filename(prefix, conf)
    print(emb_filename)
    # Only compute the embedding when its file does not exist yet.
    path = Path(emb_filename)
    if not path.is_file():
        U = fastrp_wrapper(A, conf)
        savemat(emb_filename, {'emb': U})
    else:
        print('File %s already exists, skipped.' % emb_filename)
    # NOTE(review): the network path points at blogcatalog while `prefix`
    # is 'result/www' — confirm this is the intended label file.
    f1_scores = scoring(
        [
            "--emb", emb_filename,
            "--network","example_graphs/blogcatalog.mat",
            "--num-shuffles", "3",
            "--debug",
            "--C", str(conf['C']),
            "--training-percents", "10",
        ]
    )
    # there should only be one entry here
    return -f1_scores[0]['micro']

study = optuna.create_study()  # Create a new study.
study.optimize(objective, n_trials=100)  # Invoke optimization of the objective function.

## Run Classification

With 10% training data we get: `Average score: {'micro': 0.7724027777777778, 'macro': 0.5314847218405359}`

In [None]:
python scoring.py --emb ~/ws/large-network-embeddings/www-network/randne-adj-www-200k.mat \
--network ~/ws/large-network-embeddings/www-network/www-200k.mat

We get: `Average score: {'micro': 0.7302805555555556, 'macro': 0.36847663239140405}`

In [None]:
python scoring.py --emb ~/ws/large-network-embeddings/www-network/randne-trans-www-200k.mat \
--network ~/ws/large-network-embeddings/www-network/www-200k.mat

We get: `Average score: {'micro': 0.7369333333333333, 'macro': 0.2991826875988267}`

We also would like to output the confusion matrix of the classification result. So we also train these models inline here:

In [None]:
sys.path.append('/home/hcchen/ws/large-network-embeddings')
%load_ext autoreload
%autoreload 2

In [None]:
from scoring import main

In [None]:
confusion_mat_dw = main(
    ["--emb", "/home/hcchen/ws/large-network-embeddings/www-network/deepwalk-www-200k-classification.mat",
      "--network","/home/hcchen/ws/large-network-embeddings/www-network/www-200k-classification.mat",
      "--num-shuffles", "1"])

In [None]:
confusion_mat_randne_adj = main(
    ["--emb", "/home/hcchen/ws/large-network-embeddings/www-network/randne-adj-www-200k-classification.mat",
      "--network","/home/hcchen/ws/large-network-embeddings/www-network/www-200k-classification.mat",
      "--num-shuffles", "1"])

The two results below are based on my implementation:

In [None]:
confusion_mat_randne_py_adj = main(
    ["--emb", "/home/hcchen/ws/large-network-embeddings/www-network/randne-py-adj-www-200k-classification.mat",
      "--network","/home/hcchen/ws/large-network-embeddings/www-network/www-200k-classification.mat",
      "--num-shuffles", "1"])

In [None]:
confusion_mat_randne_py_trans = main(
    ["--emb", "/home/hcchen/ws/large-network-embeddings/www-network/randne-py-trans-www-200k-classification.mat",
      "--network","/home/hcchen/ws/large-network-embeddings/www-network/www-200k-classification.mat",
      "--num-shuffles", "1"])

## Confusion Matrix

In [None]:
# Wrap the DeepWalk confusion matrix in a DataFrame for display.
df = pd.DataFrame(confusion_mat_dw)
# Add the dotted domain names as a column, then promote that column to the row index.
df['top-level domain'] = top_labels_with_dot
df.set_index('top-level domain', inplace=True)
# Rename the columns only after the label column has moved into the index,
# so the number of names matches the number of remaining columns.
df.columns = top_labels_with_dot
df

In [None]:
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.6)
sns.heatmap(df, cmap='Blues', fmt='g', annot=True,annot_kws={"size": 14},
           vmin=0, vmax=1000) # anno font size
plt.savefig('deepwalk-classification-mat.pdf')

In [None]:
# Wrap the RandNE (adjacency) confusion matrix in a DataFrame for display.
df = pd.DataFrame(confusion_mat_randne_adj)
# Add the dotted domain names as a column, then promote that column to the row index.
df['top-level domain'] = top_labels_with_dot
df.set_index('top-level domain', inplace=True)
# Rename the columns only after the label column has moved into the index,
# so the number of names matches the number of remaining columns.
df.columns = top_labels_with_dot
df

In [None]:
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.6)
sns.heatmap(df, cmap='Blues', fmt='g', annot=True,annot_kws={"size": 14},
           vmin=0, vmax=1000) # anno font size
plt.savefig('randne-classification-mat.pdf')

# Visualization

Another experiment is to see how well both methods can separate sites with different top-level domains in the embedding space. Let us still consider the same set of sites as in the classification experiment:

In [None]:
emb_200k_dw_classification = loadmat('www-network/deepwalk-www-200k-classification.mat')['emb']

In [None]:
N = emb_200k_dw_classification.shape[0]
N

In [None]:
tsne = TSNE(n_components=2, random_state=42)

For DeepWalk:

In [None]:
%%time
deepwalk_tsne = tsne.fit_transform(emb_200k_dw_classification)

In [None]:
deepwalk_tsne

In [None]:
filtered_domains = set([0, 1, 2])

In [None]:
# Scatter the DeepWalk t-SNE coordinates, one color per label, leaving out
# the labels listed in filtered_domains, and save the figure as a PDF.
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'lightpink', 'orange', 'purple']
plt.figure(figsize=(8, 6))
for label in range(K_label):
    if label not in filtered_domains:
        # Row indices of the samples carrying this label.
        members = [i for i in range(N) if cols[i] == label]
        points = deepwalk_tsne[members]
        plt.scatter(points[:, 0], points[:, 1],
                    s=40, c=colors[label], label=top_labels_with_dot[label])
plt.legend()
# plt.legend(bbox_to_anchor=(1., 0., 1., 1.), loc=1,
#            ncol=1, mode="expand", borderaxespad=0., fontsize=16)
# plt.tight_layout(rect=[0, 0, 0.88, 1])
plt.savefig('deepwalk-tsne.pdf')

For RandNE MATLAB Adj:

In [None]:
%%time
randne_tsne = tsne.fit_transform(emb_randne_adj_200k_classification)

In [None]:
# Scatter the RandNE (MATLAB, adjacency) t-SNE coordinates, one color per
# label, leaving out the labels listed in filtered_domains, and save as PDF.
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'lightpink', 'orange', 'purple']
plt.figure(figsize=(8, 6))
for label in range(K_label):
    if label not in filtered_domains:
        # Row indices of the samples carrying this label.
        members = [i for i in range(N) if cols[i] == label]
        points = randne_tsne[members]
        plt.scatter(points[:, 0], points[:, 1],
                    s=40, c=colors[label], label=top_labels_with_dot[label])
plt.legend()
# plt.legend(bbox_to_anchor=(1., 0., 1., 1.), loc=1,
#            ncol=1, mode="expand", borderaxespad=0., fontsize=16)
# plt.tight_layout(rect=[0, 0, 0.88, 1])
plt.savefig('randne-tsne.pdf')

For RandNE Python Adj:

In [None]:
%%time
randne_py_adj_tsne = tsne.fit_transform(emb_randne_py_adj_200k_classification)

In [None]:
# Scatter the RandNE (Python, adjacency) t-SNE coordinates, one color per
# label, leaving out the labels listed in filtered_domains, and save as PDF.
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'lightpink', 'orange', 'purple']
plt.figure(figsize=(8, 6))
for label in range(K_label):
    if label not in filtered_domains:
        # Row indices of the samples carrying this label.
        members = [i for i in range(N) if cols[i] == label]
        points = randne_py_adj_tsne[members]
        plt.scatter(points[:, 0], points[:, 1],
                    s=40, c=colors[label], label=top_labels_with_dot[label])
plt.legend()
plt.savefig('randne-adj-tsne.pdf')

For RandNE Python Trans:

In [None]:
%%time
randne_py_trans_tsne = tsne.fit_transform(emb_randne_py_trans_200k_classification)

In [None]:
# Scatter the RandNE (Python, transition) t-SNE coordinates, one color per
# label, leaving out the labels listed in filtered_domains, and save as PDF.
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k', 'lightpink', 'orange', 'purple']
plt.figure(figsize=(8, 6))
for label in range(K_label):
    if label not in filtered_domains:
        # Row indices of the samples carrying this label.
        members = [i for i in range(N) if cols[i] == label]
        points = randne_py_trans_tsne[members]
        plt.scatter(points[:, 0], points[:, 1],
                    s=40, c=colors[label], label=top_labels_with_dot[label])
plt.legend()
plt.savefig('randne-trans-tsne.pdf')

## Grid Search on RandNE Weights

Another thing to consider is grid searching for better RandNE weights: how does this affect the visualization result?