# Benchmark of edsger.sssp.path_length

Related benchmark: [Benchmark of popular graph/network packages](https://www.timlrx.com/2019/05/05/benchmark-of-popular-graph-network-packages/)

Data source: [Stanford Large Network Dataset Collection](https://snap.stanford.edu/data/index.html)

In [1]:
import os

import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
# The bundled 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name no longer resolves in recent releases; pick whichever exists.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')
%matplotlib inline

from edsger.sssp import convert_sorted_graph_to_csr, path_length
from edsger.commons import INFINITY_PY

# Seed NumPy's global RNG so the random edge costs drawn below are reproducible.
rs = 124
np.random.seed(rs)
# Directory the network edge-list files are read from (relative path).
data_dir_path = '../../data/'

##  Amazon product co-purchasing network, March 02 2003

https://snap.stanford.edu/data/amazon0302.html

Nodes: 262111   
Edges: 1234877

In [2]:
# Path of the Amazon co-purchasing edge list inside the data directory.
network_file_path = os.path.join(data_dir_path, "Amazon0302.txt")

In [3]:
# Tab-separated edge list: the first three lines are skipped and the next one is
# consumed as the header row, whose labels are then replaced by our own names.
amazon = pd.read_csv(
    network_file_path,
    sep='\t',
    skiprows=3,
    header=0,
)
amazon.columns = ['tail_vert', 'head_vert']
# One random cost in [0, 1) per edge (assign a constant such as 1. for unit weights).
amazon['cost'] = np.random.rand(amazon.shape[0])
amazon.head(2)

Unnamed: 0,tail_vert,head_vert,cost
0,0,1,0.106065
1,0,2,0.745471


In [4]:
# Column dtypes, non-null counts and memory footprint of the edge table.
amazon.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1234877 entries, 0 to 1234876
Data columns (total 3 columns):
tail_vert    1234877 non-null int64
head_vert    1234877 non-null int64
cost         1234877 non-null float64
dtypes: float64(1), int64(2)
memory usage: 28.3 MB


In [5]:
# Vertex count of the Amazon graph, as listed in the dataset description above.
n_vertices = 262111

### NetworkX

In [6]:
%%time
# Directed NetworkX graph with one edge per row and the 'cost' column as edge attribute.
G = nx.from_pandas_edgelist(
    amazon,
    source='tail_vert',
    target='head_vert',
    edge_attr=['cost'],
    create_using=nx.DiGraph,
)

CPU times: user 5.75 s, sys: 165 ms, total: 5.92 s
Wall time: 5.93 s


In [7]:
%%time
# Dijkstra path lengths from vertex 0, weighted by the 'cost' edge attribute.
cost_nx = nx.single_source_dijkstra_path_length(G, source=0, weight='cost')

CPU times: user 1.41 s, sys: 23.9 ms, total: 1.44 s
Wall time: 1.44 s


In [8]:
%%time
# Tabulate the NetworkX result (vertex id -> path length), ordered by vertex id.
cost_nx_df = pd.DataFrame(
    data=cost_nx.values(),
    index=cost_nx.keys(),
    columns=['path_length'],
)
cost_nx_df = cost_nx_df.sort_index()
cost_nx_df.head(2)

CPU times: user 174 ms, sys: 3.89 ms, total: 178 ms
Wall time: 177 ms


Unnamed: 0,path_length
0,0.0
1,0.106065


### Edsger

In [9]:
# convert_sorted_graph_to_csr presumably expects (per its name) edges ordered by tail
# vertex, so sort explicitly, as done for the Google graph, rather than trusting file order.
amazon.sort_values(by=['tail_vert', 'head_vert'], ascending=True, inplace=True)
amazon.reset_index(inplace=True, drop=True)
# `Index.is_monotonic` was removed in pandas 2.0; `is_monotonic_increasing` is its
# equivalent and also exists in older pandas versions.
assert amazon.index.is_monotonic_increasing
assert amazon.index.is_unique
assert amazon.index.min() == 0
assert amazon.index.max() + 1 == len(amazon)

In [10]:
%%timeit -n 1 -r 3 -o 
tail_vert = amazon.tail_vert.values.astype(np.uint32)
head_vert = amazon.head_vert.values.astype(np.uint32)
edge_weights = amazon.cost.values
indptr = convert_sorted_graph_to_csr(tail_vert, head_vert, n_vertices)

26.7 ms ± 4.63 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


<TimeitResult : 26.7 ms ± 4.63 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)>

In [None]:
%%timeit -n 1 -r 3 -o 
# Pandas/NumPy alternative, timed for comparison only (the result is discarded):
# count edges per tail vertex, fill vertices without outgoing edges with 0,
# then prepend 0 to the running total.
out_degree = amazon.tail_vert.value_counts().sort_index()
out_degree = out_degree.reindex(np.arange(n_vertices), fill_value=0)
np.insert(out_degree.cumsum().values, 0, 0)

In [None]:
%%timeit -n 1 -r 3 -o 
cost_ed = path_length(head_vert, indptr, edge_weights, 0, n_vertices)

In [None]:
cost_ed_df = pd.DataFrame(data=cost_ed, columns=['path_length'])
# NetworkX only reports vertices reachable from the source, so keep only entries below
# INFINITY_PY (presumably Edsger's value for unreachable vertices) before comparing,
# using the same filter as for the Google graph.
cost_ed_df = cost_ed_df[cost_ed_df.path_length < INFINITY_PY]

### Check

In [None]:
# True only when both frames have the same shape and exactly equal values (no float tolerance).
cost_nx_df.equals(cost_ed_df)

## Google web graph

https://snap.stanford.edu/data/web-Google.html
    
Nodes: 875713  
Edges: 5105039  

In [None]:
# Tab-separated edge list: the first three lines are skipped and the next one is
# consumed as the header row, whose labels are then replaced by our own names.
network_file_path = os.path.join(data_dir_path, "web-Google.txt")
google = pd.read_csv(
    network_file_path,
    sep='\t',
    skiprows=3,
    header=0,
)
google.columns = ['tail_vert', 'head_vert']
# One random cost in [0, 1) per edge (assign a constant such as 1. for unit weights).
google['cost'] = np.random.rand(google.shape[0])
google.head(2)

In [None]:
# Column dtypes, non-null counts and memory footprint of the edge table.
google.info()

In [None]:
# Node count listed in the dataset description above.
# NOTE(review): the Edsger cells below pass 916428 instead of this value, presumably
# because vertex ids are not contiguous — confirm, and consider a single shared variable.
n_vertices = 875713

### NetworkX

In [None]:
%%time
# Directed NetworkX graph with one edge per row and the 'cost' column as edge attribute.
G = nx.from_pandas_edgelist(
    google,
    source='tail_vert',
    target='head_vert',
    edge_attr=['cost'],
    create_using=nx.DiGraph,
)

In [None]:
# Number of distinct vertices that appear in the edge list.
G.number_of_nodes()

In [None]:
# Number of directed edges stored in the graph.
G.number_of_edges()

In [None]:
%%time
# Dijkstra path lengths from vertex 0, weighted by the 'cost' edge attribute.
cost_nx = nx.single_source_dijkstra_path_length(G, source=0, weight='cost')

In [None]:
%%time
# Tabulate the NetworkX result (vertex id -> path length), ordered by vertex id.
cost_nx_df = pd.DataFrame(
    data=cost_nx.values(),
    index=cost_nx.keys(),
    columns=['path_length'],
)
cost_nx_df = cost_nx_df.sort_index()
cost_nx_df.head(2)

In [None]:
# Number of vertices for which NetworkX returned a path length.
len(cost_nx_df)

### Edsger

In [None]:
# Order edges by tail vertex, then head vertex, and renumber the rows from 0.
google.sort_values(by=['tail_vert', 'head_vert'], ascending=True, inplace=True)
google.reset_index(inplace=True, drop=True)
# `Index.is_monotonic` was removed in pandas 2.0; `is_monotonic_increasing` is its
# equivalent and also exists in older pandas versions.
assert google.index.is_monotonic_increasing
assert google.index.is_unique
assert google.index.min() == 0
assert google.index.max() + 1 == len(google)

In [None]:
# Smallest and largest vertex id on each side of the edges.
print(google.tail_vert.min(), google.tail_vert.max())
print(google.head_vert.min(), google.head_vert.max())

In [None]:
%%time
# Pull the edge columns out as NumPy arrays; vertex ids are cast to unsigned 32-bit.
edge_weights = google['cost'].values
head_vert = google['head_vert'].values.astype(np.uint32)
tail_vert = google['tail_vert'].values.astype(np.uint32)
# NOTE(review): 916428 is used instead of n_vertices, presumably largest vertex id + 1
# since ids are not contiguous; confirm against the min/max printed above.
indptr = convert_sorted_graph_to_csr(tail_vert, head_vert, 916428)

In [None]:
%%time
# Edsger path lengths; the 0 is presumably the source vertex, matching the NetworkX call.
# NOTE(review): 916428 presumably has to match the vertex count given to
# convert_sorted_graph_to_csr — consider a single shared variable.
cost_ed = path_length(head_vert, indptr, edge_weights, 0, 916428)

In [None]:
# Wrap the Edsger result in a one-column frame; the default integer index is
# presumably the vertex id — confirm against path_length's return layout.
cost_ed_df = pd.DataFrame(data=cost_ed, columns=['path_length'])

In [None]:
# Keep only rows whose path length is below INFINITY_PY (presumably the value Edsger
# reports for vertices unreachable from the source).
cost_ed_df = cost_ed_df[cost_ed_df.path_length < INFINITY_PY]

In [None]:
# Number of vertices left after the filter; to be compared with the NetworkX count.
len(cost_ed_df)

### Check

In [None]:
# True only when both frames have the same shape and exactly equal values (no float tolerance).
cost_nx_df.equals(cost_ed_df)