In [1]:
import itertools
import multiprocessing
import random

import networkx as nx
import numpy as np
import pandas as pd

In [2]:
df = pd.DataFrame([
        ('Facebook Northwestern University', '', 'socfb-Northwestern25/socfb-Northwestern25.edges.gz'),
        ('IMDB movies and actors', '', 'imdb/actors_movies.edges.gz'),
        ('IMDB actors costar', 'W', 'imdb/actors_costar.edges.gz'),
        ('Twitter US politics', 'DW', 'icwsm_polarization/retweet-digraph.edges.gz'),
        ('Enron Email', 'DW', 'email-Enron/email-Enron.edges.gz'),
        ('Enron Executive Email', '', 'ia-enron-only/ia-enron-only.edges'),
        ('Wikipedia math', 'D', 'enwiki_math/enwiki_math.edges.gz'),
        ('Internet routers', '', 'tech-RL-caida/tech-RL-caida.edges.gz'),
        ('US air transportation', '', 'openflights/openflights_usa.edges.gz'),
        ('World air transportation', '', 'openflights/openflights_world.edges.gz'),
        ('Yeast protein interactions', '', 'bio-yeast-protein-inter/bio-yeast-protein-inter.edges'),
        ('C. elegans brain', 'DW', 'celegansneural/celegansneural.edges'),
        ('Everglades ecological food web', 'DW', 'eco-everglades/eco-everglades.edges'),
    ],
    columns=['Name', 'Type', 'File'],
)

In [3]:
df = df.set_index('Name')

In [4]:
df.head()

Unnamed: 0_level_0,Type,File
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Facebook Northwestern University,,socfb-Northwestern25/socfb-Northwestern25.edge...
IMDB movies and actors,,imdb/actors_movies.edges.gz
IMDB actors costar,W,imdb/actors_costar.edges.gz
Twitter US politics,DW,icwsm_polarization/retweet-digraph.edges.gz
Enron Email,DW,email-Enron/email-Enron.edges.gz


In [7]:
def single_source_average_path_length(G, node):
    return np.average(list(nx.single_source_shortest_path_length(G, node).values()))

def estimated_average_path_length(G, N_sample=100):
    if G.is_directed():
        components = nx.strongly_connected_component_subgraphs(G)
    else:
        components = nx.connected_component_subgraphs(G)
    core = max(components, key=len)
    
    if core.number_of_nodes() <= N_sample:
        sampled_nodes = list(core.nodes)
    else:
        sampled_nodes = random.sample(list(core.nodes), N_sample)
    
    with multiprocessing.Pool() as pool:
        args = ((core, node) for node in sampled_nodes)
        node_apls = pool.starmap(single_source_average_path_length, args)
    
    return np.average(node_apls)

In [8]:
updates = []
for idx, row in df.iterrows():
    fname = row['File']
    print(idx)
    if 'graphml' in fname:
        G = nx.read_graphml(fname)
    else:
        graph_class = nx.DiGraph() if 'D' in row['Type'] else nx.Graph()
        data_spec = [('weight', float)] if 'W' in row['Type'] else False
        G = nx.read_edgelist(fname, create_using=graph_class, data=data_spec)
        
    N = G.number_of_nodes()
    
    if G.is_multigraph():
        MG = G
        G = nx.DiGraph() if MG.is_directed() else nx.Graph()
        G.add_edges_from((u,v) for u,v,i in MG.edges)
  
    L = G.number_of_edges()
        
    degree_view = G.in_degree if G.is_directed() else G.degree
    degrees = np.array([d for n,d in degree_view])
    
    cc = nx.average_clustering(G.to_undirected())
    
    # This corresponds to the definition of heterogeneity given in the book.
    # Note that this definition may differ from that from other sources.
    heterogeneity = (degrees ** 2).mean() / degrees.mean() ** 2
    
    updates.append({
        'Name': idx,
        'Nodes': N,
        'Links': L,
        'Density': nx.density(G),
        'Average Degree': degrees.mean(),
        'Maximum Degree': degrees.max(),
        'Heterogeneity': heterogeneity,
        'Clustering': cc,
        'Average Path Length': estimated_average_path_length(G, N_sample=1000),
    })
    

Facebook Northwestern University
IMDB movies and actors
IMDB actors costar
Twitter US politics
Enron Email
Enron Executive Email
Wikipedia math
Internet routers
US air transportation
World air transportation
Yeast protein interactions
C. elegans brain
Everglades ecological food web


In [9]:
new_df = pd.DataFrame.from_records(updates, index='Name')

In [10]:
df = df.join(new_df)

In [11]:
df.head()

Unnamed: 0_level_0,Type,File,Average Degree,Average Path Length,Clustering,Density,Heterogeneity,Links,Maximum Degree,Nodes
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Facebook Northwestern University,,socfb-Northwestern25/socfb-Northwestern25.edge...,92.4268,2.71852,0.237991,0.008748,1.835828,488337,2105,10567
IMDB movies and actors,,imdb/actors_movies.edges.gz,3.269754,12.148362,0.0,6e-06,5.406334,921160,800,563443
IMDB actors costar,W,imdb/actors_costar.edges.gz,8.025225,6.849723,0.66756,3.2e-05,4.570261,1015187,456,252999
Twitter US politics,DW,icwsm_polarization/retweet-digraph.edges.gz,2.618571,5.585333,0.026153,0.000142,8.298196,48365,204,18470
Enron Email,DW,email-Enron/email-Enron.edges.gz,3.688632,3.613747,0.119342,4.2e-05,17.436859,321918,1338,87273


## Chapter 1
* Nodes
* Links
* Density
* Average degree

In [12]:
ch1 = df[['Nodes', 'Links', 'Density', 'Average Degree']]
ch1

Unnamed: 0_level_0,Nodes,Links,Density,Average Degree
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Facebook Northwestern University,10567,488337,0.008748,92.4268
IMDB movies and actors,563443,921160,6e-06,3.269754
IMDB actors costar,252999,1015187,3.2e-05,8.025225
Twitter US politics,18470,48365,0.000142,2.618571
Enron Email,87273,321918,4.2e-05,3.688632
Enron Executive Email,143,623,0.061361,8.713287
Wikipedia math,15220,194103,0.000838,12.753154
Internet routers,190914,607610,3.3e-05,6.365274
US air transportation,546,2781,0.018691,10.186813
World air transportation,3179,18617,0.003685,11.712488


In [13]:
print(ch1.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &   Nodes &    Links &   Density &  Average Degree \\
Name                             &         &          &           &                 \\
\midrule
Facebook Northwestern University &   10567 &   488337 &  0.008748 &       92.426800 \\
IMDB movies and actors           &  563443 &   921160 &  0.000006 &        3.269754 \\
IMDB actors costar               &  252999 &  1015187 &  0.000032 &        8.025225 \\
Twitter US politics              &   18470 &    48365 &  0.000142 &        2.618571 \\
Enron Email                      &   87273 &   321918 &  0.000042 &        3.688632 \\
Enron Executive Email            &     143 &      623 &  0.061361 &        8.713287 \\
Wikipedia math                   &   15220 &   194103 &  0.000838 &       12.753154 \\
Internet routers                 &  190914 &   607610 &  0.000033 &        6.365274 \\
US air transportation            &     546 &     2781 &  0.018691 &       10.186813 \\
World air transportation        

# Chapter 2

* Nodes
* Links
* APL
* CC

In [14]:
ch2 = df[['Nodes', 'Links', 'Clustering', 'Average Path Length']]
ch2

Unnamed: 0_level_0,Nodes,Links,Clustering,Average Path Length
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Facebook Northwestern University,10567,488337,0.237991,2.71852
IMDB movies and actors,563443,921160,0.0,12.148362
IMDB actors costar,252999,1015187,0.66756,6.849723
Twitter US politics,18470,48365,0.026153,5.585333
Enron Email,87273,321918,0.119342,3.613747
Enron Executive Email,143,623,0.433907,2.946257
Wikipedia math,15220,194103,0.30744,3.91979
Internet routers,190914,607610,0.158173,7.009731
US air transportation,546,2781,0.493045,3.191425
World air transportation,3179,18617,0.490542,3.950245


In [15]:
print(ch2.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &   Nodes &    Links &  Clustering &  Average Path Length \\
Name                             &         &          &             &                      \\
\midrule
Facebook Northwestern University &   10567 &   488337 &    0.237991 &             2.718520 \\
IMDB movies and actors           &  563443 &   921160 &    0.000000 &            12.148362 \\
IMDB actors costar               &  252999 &  1015187 &    0.667560 &             6.849723 \\
Twitter US politics              &   18470 &    48365 &    0.026153 &             5.585333 \\
Enron Email                      &   87273 &   321918 &    0.119342 &             3.613747 \\
Enron Executive Email            &     143 &      623 &    0.433907 &             2.946257 \\
Wikipedia math                   &   15220 &   194103 &    0.307440 &             3.919790 \\
Internet routers                 &  190914 &   607610 &    0.158173 &             7.009731 \\
US air transportation            &     546 &     

## Chapter 3
* Nodes
* Links
* Density
* Average degree
* Maximum degree
* Heterogeneity

In [16]:
ch3 = df[['Nodes', 'Links', 'Average Degree', 'Maximum Degree', 'Heterogeneity']]
ch3

Unnamed: 0_level_0,Nodes,Links,Average Degree,Maximum Degree,Heterogeneity
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Facebook Northwestern University,10567,488337,92.4268,2105,1.835828
IMDB movies and actors,563443,921160,3.269754,800,5.406334
IMDB actors costar,252999,1015187,8.025225,456,4.570261
Twitter US politics,18470,48365,2.618571,204,8.298196
Enron Email,87273,321918,3.688632,1338,17.436859
Enron Executive Email,143,623,8.713287,42,1.482948
Wikipedia math,15220,194103,12.753154,5171,38.210205
Internet routers,190914,607610,6.365274,1071,5.963275
US air transportation,546,2781,10.186813,153,5.347495
World air transportation,3179,18617,11.712488,246,5.509073


In [17]:
print(ch3.to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &   Nodes &    Links &  Average Degree &  Maximum Degree &  Heterogeneity \\
Name                             &         &          &                 &                 &                \\
\midrule
Facebook Northwestern University &   10567 &   488337 &       92.426800 &            2105 &       1.835828 \\
IMDB movies and actors           &  563443 &   921160 &        3.269754 &             800 &       5.406334 \\
IMDB actors costar               &  252999 &  1015187 &        8.025225 &             456 &       4.570261 \\
Twitter US politics              &   18470 &    48365 &        2.618571 &             204 &       8.298196 \\
Enron Email                      &   87273 &   321918 &        3.688632 &            1338 &      17.436859 \\
Enron Executive Email            &     143 &      623 &        8.713287 &              42 &       1.482948 \\
Wikipedia math                   &   15220 &   194103 &       12.753154 &            5171 &      38.210205 \

In [18]:
df.to_csv('summary_statistics.csv')