In [1]:
import itertools
import random

import networkx as nx
import numpy as np
import pandas as pd

In [2]:
# Catalogue of the network datasets to summarise: display name, type flags
# and path of the edge-list file. The loading cell checks the 'Type' string
# with `in`: 'D' selects a directed graph and 'W' reads a float edge weight;
# an empty string means undirected and unweighted.
df = pd.DataFrame(
    columns=['Name', 'Type', 'File'],
    data=[
        ('Facebook Northwestern University', '', 'socfb-Northwestern25/socfb-Northwestern25.edges.gz'),
        ('IMDB movies and actors', '', 'imdb/actors_movies.edges.gz'),
        ('IMDB actors costar', 'W', 'imdb/actors_costar.edges.gz'),
        ('Twitter US politics', 'DW', 'icwsm_polarization/retweet-digraph.edges.gz'),
        ('Enron Email', 'DW', 'email-Enron/email-Enron.edges.gz'),
        ('Enron Executive Email', '', 'ia-enron-only/ia-enron-only.edges'),
        ('Wikipedia math', 'D', 'enwiki_math/enwiki_math.edges.gz'),
        ('Internet routers', 'W', 'tech-RL-caida/tech-RL-caida.edges.gz'),
        ('US air transportation', '', 'openflights/openflights_usa.edges.gz'),
        ('World air transportation', '', 'openflights/openflights_world.edges.gz'),
        ('Yeast protein interactions', '', 'bio-yeast-protein-inter/bio-yeast-protein-inter.edges'),
        ('C. elegans brain', 'DW', 'celegansneural/celegansneural.edges'),
        ('Everglades ecological food web', 'DW', 'eco-everglades/eco-everglades.edges'),
    ],
)

In [3]:
# Label rows by dataset name; the per-dataset statistics computed later are
# also indexed by 'Name' and joined back onto this frame.
df = df.set_index('Name')

In [4]:
# Preview the first rows of the dataset catalogue.
df.head()

Unnamed: 0_level_0,Type,File
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Facebook Northwestern University,,socfb-Northwestern25/socfb-Northwestern25.edge...
IMDB movies and actors,,imdb/actors_movies.edges.gz
IMDB actors costar,W,imdb/actors_costar.edges.gz
Twitter US politics,DW,icwsm_polarization/retweet-digraph.edges.gz
Enron Email,DW,email-Enron/email-Enron.edges.gz


In [5]:
def sample_edges(G, N):
    """Return up to ``N`` edges of ``G`` sampled uniformly without replacement.

    Parameters
    ----------
    G : graph
        Any object exposing ``number_of_edges()`` and an iterable ``edges``
        attribute (for example a networkx graph).
    N : int or float
        Requested sample size; truncated with ``int()``.

    Returns
    -------
    list
        Every edge when ``N`` is at least the number of edges; otherwise
        ``N`` distinct edges, in the order ``G.edges`` yields them.

    Raises
    ------
    ValueError
        If ``N`` is negative (raised by ``random.sample``).
    """
    N = int(N)
    number_of_edges = G.number_of_edges()
    if N >= number_of_edges:
        return list(G.edges)

    # Draw the edge positions up front, then pick them out in a single pass
    # over the edge iterator. Sorting in descending order keeps the next
    # wanted position at the end of the list, so it can be popped cheaply.
    indexes = sorted(random.sample(range(number_of_edges), N), reverse=True)
    edges = []
    for idx, edge in enumerate(G.edges):
        if not indexes:
            # Everything requested has been collected; stop iterating early.
            break
        if indexes[-1] == idx:
            edges.append(edge)
            indexes.pop()
    return edges

def sample_pairs(G, N):
    """Return up to ``N`` unordered node pairs of ``G`` sampled uniformly.

    Parameters
    ----------
    G : graph
        Any object exposing ``number_of_nodes()`` and an iterable ``nodes``
        attribute (for example a networkx graph).
    N : int or float
        Requested sample size; truncated with ``int()``.

    Returns
    -------
    list of tuple
        Every 2-combination of nodes when ``N`` is at least the number of
        pairs; otherwise ``N`` distinct pairs, in the order
        ``itertools.combinations(G.nodes, 2)`` yields them.

    Raises
    ------
    ValueError
        If ``N`` is negative (raised by ``random.sample``).
    """
    N = int(N)
    number_of_nodes = G.number_of_nodes()
    # Integer arithmetic keeps the count exact; n * (n - 1) is always even.
    number_of_pairs = number_of_nodes * (number_of_nodes - 1) // 2
    pairs = itertools.combinations(G.nodes, 2)
    if N >= number_of_pairs:
        return list(pairs)

    # Choose positions in the (lazy) stream of combinations, then collect
    # them in one pass without materialising every pair. Descending order
    # keeps the next wanted position at the end of the list.
    indexes = sorted(random.sample(range(number_of_pairs), N), reverse=True)
    sample = []
    for idx, pair in enumerate(pairs):
        if not indexes:
            # All requested pairs found; no need to enumerate the rest.
            break
        if indexes[-1] == idx:
            sample.append(pair)
            indexes.pop()
    return sample

In [6]:
# Load every dataset in the catalogue and collect one record of summary
# statistics per network. The records are turned into a frame afterwards.
updates = []
for idx, row in df.iterrows():
    fname = row['File']
    print(idx)
    if 'graphml' in fname:
        G = nx.read_graphml(fname)
    else:
        # 'D' in the type flags selects a directed graph, 'W' a float weight
        # column; without 'W' no edge data is parsed.
        graph_class = nx.DiGraph() if 'D' in row['Type'] else nx.Graph()
        data_spec = [('weight', float)] if 'W' in row['Type'] else False
        G = nx.read_edgelist(fname, create_using=graph_class, data=data_spec)

    N = G.number_of_nodes()

    if isinstance(G, nx.MultiGraph):
        # Collapse parallel edges into a simple graph of the same
        # directedness. Nodes are copied explicitly so isolated nodes are
        # kept and density stays consistent with the node count N above.
        MG = G
        G = nx.DiGraph() if isinstance(MG, nx.DiGraph) else nx.Graph()
        G.add_nodes_from(MG)
        G.add_edges_from((u, v) for u, v, i in MG.edges)

    L = G.number_of_edges()

    # For directed graphs use in-degree, so the mean degree is links / nodes.
    degree_view = G.in_degree if isinstance(G, nx.DiGraph) else G.degree
    degrees = np.array([d for n, d in degree_view])

    # Clustering and path lengths are measured on the undirected version;
    # build that copy once instead of once per statistic.
    UG = G.to_undirected()
    cc = nx.average_clustering(UG)

    # Largest connected component. `nx.connected_component_subgraphs` was
    # removed in NetworkX 2.4; this is the documented replacement.
    core = UG.subgraph(max(nx.connected_components(UG), key=len))
    core_size = core.number_of_nodes()

    # Estimate the average path length by vertex sampling: run a BFS from up
    # to N_sample sources and average the distance to every *other* node.
    N_sample = 100
    sampled_nodes = (list(core.nodes) if core_size <= N_sample
                     else random.sample(list(core.nodes), N_sample))
    if core_size > 1:
        # Divide by core_size - 1: the BFS result includes the source itself
        # at distance 0, which must not count as a path.
        apl = np.average([
            sum(nx.single_source_shortest_path_length(core, n).values()) / (core_size - 1)
            for n in sampled_nodes
        ])
    else:
        # A single-node component has no pairs; report 0.
        apl = 0.0

    updates.append({
        'Name': idx,
        'Nodes': N,
        'Links': L,
        'Density': nx.density(G),
        'Average Degree': degrees.mean(),
        'Maximum Degree': degrees.max(),
        # NOTE(review): this is <k^2> / <k>. Some texts define the
        # heterogeneity parameter as <k^2> / <k>^2 - confirm which is intended.
        'Heterogeneity': (degrees ** 2).mean() / degrees.mean(),
        'Clustering': cc,
        'Average Path Length': apl,
    })
    

Facebook Northwestern University
IMDB movies and actors
IMDB actors costar
Twitter US politics
Enron Email
Enron Executive Email
Wikipedia math
Internet routers
US air transportation
World air transportation
Yeast protein interactions
C. elegans brain
Everglades ecological food web


In [7]:
# Turn the list of per-dataset result dicts into a frame indexed by dataset name.
new_df = pd.DataFrame.from_records(updates, index='Name')

In [8]:
# Attach the computed statistics to the catalogue, aligned on the 'Name' index.
# NOTE: this rebinds `df`, so the cell is not idempotent - running it a second
# time joins onto the already-extended frame and pandas rejects the
# overlapping column names. Re-run from the catalogue cell instead.
df = df.join(new_df)

In [9]:
# Preview the catalogue with the statistics columns attached.
df.head()

Unnamed: 0_level_0,Type,File,Average Degree,Average Path Length,Clustering,Density,Heterogeneity,Links,Maximum Degree,Nodes
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Facebook Northwestern University,,socfb-Northwestern25/socfb-Northwestern25.edge...,92.4268,2.731725,0.237991,0.008748,169.679746,488337,2105,10567
IMDB movies and actors,,imdb/actors_movies.edges.gz,3.269754,12.310622,0.0,6e-06,17.677382,921160,800,563443
IMDB actors costar,W,imdb/actors_costar.edges.gz,8.025225,6.843002,0.66756,3.2e-05,36.677378,1015187,456,252999
Twitter US politics,DW,icwsm_polarization/retweet-digraph.edges.gz,2.618571,5.037075,0.026153,0.000142,21.729412,48365,204,18470
Enron Email,DW,email-Enron/email-Enron.edges.gz,3.688632,4.841075,0.119342,4.2e-05,64.318162,321918,1338,87273


## Chapter 1
* Nodes
* Links
* Density
* Average degree

In [10]:
# Chapter 1 table: network size, density and average degree.
ch1 = df[['Nodes', 'Links', 'Density', 'Average Degree']]
ch1

Unnamed: 0_level_0,Nodes,Links,Density,Average Degree
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Facebook Northwestern University,10567,488337,0.008748,92.4268
IMDB movies and actors,563443,921160,6e-06,3.269754
IMDB actors costar,252999,1015187,3.2e-05,8.025225
Twitter US politics,18470,48365,0.000142,2.618571
Enron Email,87273,321918,4.2e-05,3.688632
Enron Executive Email,143,623,0.061361,8.713287
Wikipedia math,15220,194103,0.000838,12.753154
Internet routers,190914,607610,3.3e-05,6.365274
US air transportation,546,2781,0.018691,10.186813
World air transportation,3179,18617,0.003685,11.712488


In [11]:
# print() emits the raw LaTeX source so it can be copied as-is.
print(ch1.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &   Nodes &    Links &   Density &  Average Degree \\
Name                             &         &          &           &                 \\
\midrule
Facebook Northwestern University &   10567 &   488337 &  0.008748 &       92.426800 \\
IMDB movies and actors           &  563443 &   921160 &  0.000006 &        3.269754 \\
IMDB actors costar               &  252999 &  1015187 &  0.000032 &        8.025225 \\
Twitter US politics              &   18470 &    48365 &  0.000142 &        2.618571 \\
Enron Email                      &   87273 &   321918 &  0.000042 &        3.688632 \\
Enron Executive Email            &     143 &      623 &  0.061361 &        8.713287 \\
Wikipedia math                   &   15220 &   194103 &  0.000838 &       12.753154 \\
Internet routers                 &  190914 &   607610 &  0.000033 &        6.365274 \\
US air transportation            &     546 &     2781 &  0.018691 &       10.186813 \\
World air transportation        

## Chapter 2

* Nodes
* Links
* Average path length (APL)
* Clustering coefficient (CC)

In [12]:
# Chapter 2 table: network size, clustering coefficient and average path length.
ch2 = df[['Nodes', 'Links', 'Clustering', 'Average Path Length']]
ch2

Unnamed: 0_level_0,Nodes,Links,Clustering,Average Path Length
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Facebook Northwestern University,10567,488337,0.237991,2.731725
IMDB movies and actors,563443,921160,0.0,12.310622
IMDB actors costar,252999,1015187,0.66756,6.843002
Twitter US politics,18470,48365,0.026153,5.037075
Enron Email,87273,321918,0.119342,4.841075
Enron Executive Email,143,623,0.433907,2.919231
Wikipedia math,15220,194103,0.30744,2.773786
Internet routers,190914,607610,0.158173,7.072495
US air transportation,546,2781,0.493045,3.129314
World air transportation,3179,18617,0.490542,3.877527


In [13]:
# print() emits the raw LaTeX source so it can be copied as-is.
print(ch2.to_latex())

\begin{tabular}{lrrrr}
\toprule
{} &   Nodes &    Links &  Clustering &  Average Path Length \\
Name                             &         &          &             &                      \\
\midrule
Facebook Northwestern University &   10567 &   488337 &    0.237991 &             2.731725 \\
IMDB movies and actors           &  563443 &   921160 &    0.000000 &            12.310622 \\
IMDB actors costar               &  252999 &  1015187 &    0.667560 &             6.843002 \\
Twitter US politics              &   18470 &    48365 &    0.026153 &             5.037075 \\
Enron Email                      &   87273 &   321918 &    0.119342 &             4.841075 \\
Enron Executive Email            &     143 &      623 &    0.433907 &             2.919231 \\
Wikipedia math                   &   15220 &   194103 &    0.307440 &             2.773786 \\
Internet routers                 &  190914 &   607610 &    0.158173 &             7.072495 \\
US air transportation            &     546 &     

## Chapter 3
* Nodes
* Links
* Density
* Average degree
* Maximum degree
* Heterogeneity

In [14]:
# Chapter 3 table: network size plus degree statistics.
# NOTE(review): the bullet list for this section also names Density, which is
# not selected here - confirm which of the two is intended.
ch3 = df[['Nodes', 'Links', 'Average Degree', 'Maximum Degree', 'Heterogeneity']]
ch3

Unnamed: 0_level_0,Nodes,Links,Average Degree,Maximum Degree,Heterogeneity
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Facebook Northwestern University,10567,488337,92.4268,2105,169.679746
IMDB movies and actors,563443,921160,3.269754,800,17.677382
IMDB actors costar,252999,1015187,8.025225,456,36.677378
Twitter US politics,18470,48365,2.618571,204,21.729412
Enron Email,87273,321918,3.688632,1338,64.318162
Enron Executive Email,143,623,8.713287,42,12.921348
Wikipedia math,15220,194103,12.753154,5171,487.300614
Internet routers,190914,607610,6.365274,1071,37.957883
US air transportation,546,2781,10.186813,153,54.47393
World air transportation,3179,18617,11.712488,246,64.52495


In [15]:
# print() emits the raw LaTeX source so it can be copied as-is.
print(ch3.to_latex())

\begin{tabular}{lrrrrr}
\toprule
{} &   Nodes &    Links &  Average Degree &  Maximum Degree &  Heterogeneity \\
Name                             &         &          &                 &                 &                \\
\midrule
Facebook Northwestern University &   10567 &   488337 &       92.426800 &            2105 &     169.679746 \\
IMDB movies and actors           &  563443 &   921160 &        3.269754 &             800 &      17.677382 \\
IMDB actors costar               &  252999 &  1015187 &        8.025225 &             456 &      36.677378 \\
Twitter US politics              &   18470 &    48365 &        2.618571 &             204 &      21.729412 \\
Enron Email                      &   87273 &   321918 &        3.688632 &            1338 &      64.318162 \\
Enron Executive Email            &     143 &      623 &        8.713287 &              42 &      12.921348 \\
Wikipedia math                   &   15220 &   194103 &       12.753154 &            5171 &     487.300614 \

In [16]:
# Save the full table (catalogue columns plus all statistics) to a CSV file
# in the current working directory.
df.to_csv('summary_statistics.csv')