In [1]:
import csv
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
from multiprocessing import Pool
import time
import itertools

In [3]:
def chunks(l, n):
    """Yield consecutive tuples of up to `n` items drawn from iterable `l`.

    The last tuple may be shorter than `n`. Nothing is yielded once the
    input is exhausted, nor when `n` is 0.
    """
    remaining = iter(l)
    piece = tuple(itertools.islice(remaining, n))
    while piece:
        yield piece
        piece = tuple(itertools.islice(remaining, n))


def _betmap(G_normalized_weight_sources_tuple):
    """Single-argument adapter so `Pool.map` can drive the per-chunk call.

    `Pool.map` hands each worker exactly one object, so the positional
    arguments arrive packed in one plain tuple and are unpacked into
    `nx.betweenness_centrality_source` here.
    """
    call_args = G_normalized_weight_sources_tuple
    return nx.betweenness_centrality_source(*call_args)


def betweenness_centrality_parallel(G, processes=None):
    """Compute betweenness centrality by fanning node chunks out to a process pool.

    Parameters
    ----------
    G : graph
        Graph handed unchanged to every worker.
    processes : int, optional
        Worker count passed to `multiprocessing.Pool`; `None` lets `Pool`
        choose its default.

    Returns
    -------
    dict
        Node -> sum of the partial scores returned for every chunk.
        An empty graph yields an empty dict.
    """
    if G.order() == 0:
        # No nodes means no chunks and nothing to reduce.
        return {}

    # The context manager tears the pool down even when a worker raises,
    # so worker processes are not leaked across notebook re-runs.
    with Pool(processes=processes) as p:
        node_divisor = len(p._pool) * 4
        # Keep at least one node per chunk: a chunk size of 0 makes `chunks`
        # yield nothing, which would leave the reduction below with no
        # partial result to start from.
        chunk_size = max(1, G.order() // node_divisor)
        node_chunks = list(chunks(G.nodes(), chunk_size))
        num_chunks = len(node_chunks)
        bt_sc = p.map(_betmap,
                      zip([G] * num_chunks,
                          [True] * num_chunks,
                          [None] * num_chunks,
                          node_chunks))

    # Reduce the partial solutions by summing them into the first one
    bt_c = bt_sc[0]
    for bt in bt_sc[1:]:
        for n in bt:
            bt_c[n] += bt[n]
    return bt_c


## Data Import

In [4]:
# Load the edge list; `reps` is parsed as int64 and `auth`/`comm` as strings.
df = pd.read_csv('name_name.csv', dtype={"reps": np.int64, "comm" : str, "auth" : str})


In [5]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,reps,auth,comm
0,26,Maria Shaldibina,Stev Witzel
1,17,Eduardo Valentin,Daniel
2,24,Eric W. Biederman,geiti94
3,519,Paul Jakma,Paul Jakma
4,122,Laxman Dewangan,Stephen Warren


## Data Cleaning

In [6]:
# Total number of missing cells across the whole frame.
df.isnull().values.sum()

345

In [7]:
# Rows that contain at least one missing value. `axis` is passed by keyword
# because current pandas releases no longer accept it positionally in `any`.
nan_rows = df[df.isnull().any(axis=1)]
nan_rows

Unnamed: 0,reps,auth,comm
2295,225,,root
5867,24,,mark
12673,18,,舒志凌
13649,55,,Jeff Garzik
18747,21,John Axel Eriksson,
23358,21,Fish,
25165,22,,黄志伟
27935,477,,git-darcs-import
29367,18,Lars van de Kerkhof,
29690,27,,Ease


In [8]:
# Drop every row that has a missing value (rebinding `df`), then re-count
# the missing cells to confirm none remain.
df = df.dropna()
df.isnull().values.sum()

0

## Sample Graph

In [9]:
#dfn = df.sample(1000)
#del df

In [10]:
#G = nx.DiGraph(directed=True)
#G.add_edge(1, 3, weight=8)
#G.add_edge(3, 1, weight=1)
#G.add_edge(3, 4, weight=6)
#G.add_edge(1, 2, weight=1)
#G.add_edge(2, 4, weight=3)

#pos=nx.spring_layout(G) 
#nx.draw(G,pos)
#labels = nx.get_edge_attributes(G,'weight')
#nx.draw_networkx_edge_labels(G,pos,edge_labels=labels)

In [11]:
# Build a directed graph with one edge comm -> auth per row; the `reps`
# column is stored as the edge attribute of the same name.
graph = nx.from_pandas_edgelist(df, source = 'comm', target = 'auth', edge_attr = 'reps',create_using = nx.DiGraph())

In [12]:
#del dfn

In [15]:
number_of_nodes = nx.number_of_nodes(graph)

# Lay the graph out, draw it, and annotate every edge with its `reps` value.
pos=nx.spring_layout(graph) 
nx.draw(graph,pos)
# The graph is built with edge_attr='reps', so there is no 'weight' attribute
# to read; asking for 'weight' returned an empty dict and drew no labels.
labels = nx.get_edge_attributes(graph,'reps')
nx.draw_networkx_edge_labels(graph,pos,edge_labels=labels)

## Centrality

In [None]:
print("Betweenness")
# Approximate betweenness from roughly 1% of the nodes used as pivots.
# Clamp the pivot count to [1, number_of_nodes] so that graphs with fewer
# than 100 nodes do not end up with k == 0, and fix the seed so the sampled
# pivots (and therefore the scores) are reproducible across runs.
k_pivots = min(number_of_nodes, max(1, int(0.01*number_of_nodes)))
b = nx.betweenness_centrality(graph, k=k_pivots, seed=42)
#b = betweenness_centrality_parallel(graph, 9)

Betweenness


In [None]:
print("Degree centrality")
d = nx.degree_centrality(graph)

print("Closeness centrality")
# Unlike betweenness_centrality, closeness_centrality has no sampling
# parameter `k`; passing one raises TypeError, so it is computed over all nodes.
c = nx.closeness_centrality(graph)

print("Eigenvector centrality")
# PageRank with edges weighted by the `reps` attribute.
pg = nx.pagerank(graph, alpha=0.85, personalization=None, max_iter=100, tol=1e-06, nstart=None, weight='reps', dangling=None)

print("HITS")
hub, auths = nx.hits(graph, max_iter=100, tol=1e-08, nstart=None, normalized=True)

print("Diameter")
# Work on a single undirected copy: keep only its largest connected
# component, then measure the diameter of what is left.
g = graph.to_undirected()
largest = max(nx.connected_components(g), key=len)
removal = set(g.nodes) - largest
g.remove_nodes_from(removal)
dia = nx.diameter(g)

print("Clustering")
# NOTE(review): this call names the edge attribute 'weight', but the graph is
# built with 'reps' as its only edge attribute - confirm whether 'reps' was intended.
avg_clust = nx.average_clustering(g, weight = 'weight')
clustering = dict(nx.clustering(g))

## Top X in each Centrality

In [None]:
x = 5
from heapq import nlargest

# For every measure, keep the `x` node names with the highest score,
# ordered from highest to lowest.
sb, sc, sd, spg, shubs, sauths = (
    nlargest(x, scores, key=scores.get)
    for scores in (b, c, d, pg, hub, auths)
)

In [None]:
# Print the top-ranked node names for each measure.
# The "Eigen" row shows `spg`, i.e. the PageRank ranking.
print("Betweenness", sb)
print("Closeness", sc)
print("Degree", sd)
print("Eigen", spg)
print("Hubs", shubs)
print("Authorities", sauths)
#print("Diameter of largest component", dia)
#print("Average clustering coefficient ", avg_clust)
#print("Clustering coefficient values :")
#print(clustering)

## Distribution

In [None]:
# Per-node in-degree and out-degree of the directed graph, as plain dicts.
din = dict(graph.in_degree())
dout = dict(graph.out_degree())
#for item in sorted(d, key=d.get, reverse=True):
#    print (item, d[item])
#USE b, c, d

In [None]:
from collections import defaultdict


def _count_values(scores):
    """Return a defaultdict mapping each distinct value in `scores` to the
    number of keys that carry exactly that value."""
    histogram = defaultdict(int)
    for value in scores.values():
        histogram[value] += 1
    return histogram


# One histogram per measure: value -> number of nodes with exactly that value.
# A single helper replaces eight copies of the same counting loop.
counter_indegree = _count_values(din)
counter_outdegree = _count_values(dout)
counter_b = _count_values(b)
counter_c = _count_values(c)
counter_d = _count_values(d)
counter_pg = _count_values(pg)

# hubs and auths
counter_hubs = _count_values(hub)
counter_auths = _count_values(auths)

In [None]:
# Display the out-degree histogram (out-degree -> number of nodes).
counter_outdegree

In [None]:
#del counter_outdegree[283]

### In Degree

In [None]:
# In-degree distribution: x is the in-degree, y the number of nodes having it.
lists = sorted(counter_indegree.items())  # (in-degree, count) pairs ordered by in-degree
x, y = zip(*lists)
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(x, y)
fig.savefig('dist_in_degree.png', bbox_inches='tight')
plt.show()


### Out Degree

In [None]:
# Out-degree distribution: x is the out-degree, y the number of nodes having it.
lists = sorted(counter_outdegree.items())  # (out-degree, count) pairs ordered by out-degree
x, y = zip(*lists)
fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(x, y)
fig.savefig('dist_out_degree.png', bbox_inches='tight')
plt.show()

### Undirected Degree

In [None]:
## HITS, SAVE IMAGES
G2 = graph.to_undirected()
ddegree = dict(G2.degree())
counter_ddegree = defaultdict(int)
for k, v in ddegree.items():
    counter_ddegree[v]+=1

lists = sorted(counter_ddegree.items()) # sorted by key, return a list of tuples
x, y = zip(*lists) # unpack a list of pairs into two tuples
plt.figure(figsize=(20,10))
plt.plot(x, y)
plt.savefig('dist_und_degree.png', bbox_inches='tight')
plt.show()

del G2

In [None]:
# Check whether the undirected-degree histogram equals the out-degree histogram.
counter_ddegree == counter_outdegree

### Betweenness Centrality

In [None]:
#plt.bar(counter_b.keys(), counter_b.values(), 0.05)
lists = sorted(counter_b.items()) # sorted by key, return a list of tuples
x, y = zip(*lists) # unpack a list of pairs into two tuples
plt.figure(figsize=(20,10))
plt.plot(x, y)
plt.savefig('dist_centrality_betweeness.png', bbox_inches='tight')
plt.show()

### Degree Centrality

In [None]:
# Degree-centrality distribution: x is the score, y the number of nodes with that exact score.
lists = sorted(counter_d.items()) # sorted by key, return a list of tuples
x, y = zip(*lists) # unpack a list of pairs into two tuples
plt.figure(figsize=(20,10))
plt.plot(x, y)
# Save only after plotting; saving first wrote an empty canvas to disk.
plt.savefig('dist_centrality_degree.png', bbox_inches='tight')
plt.show()

### Closeness Centrality

In [None]:
# Closeness-centrality distribution: x is the score, y the number of nodes with that exact score.
lists = sorted(counter_c.items()) # sorted by key, return a list of tuples
x, y = zip(*lists) # unpack a list of pairs into two tuples
plt.figure(figsize=(20,10))
plt.plot(x, y)
# Save only after plotting; saving first wrote an empty canvas to disk.
plt.savefig('dist_centrality_closeness.png', bbox_inches='tight')
plt.show()

### Eigenvector (PageRank) Centrality

In [None]:
# PageRank distribution: x is the score, y the number of nodes with that exact score.
lists = sorted(counter_pg.items()) # sorted by key, return a list of tuples
x, y = zip(*lists) # unpack a list of pairs into two tuples
plt.figure(figsize=(20,10))
plt.plot(x, y)
# Save only after plotting; saving first wrote an empty canvas to disk.
plt.savefig('dist_centrality_pagerank.png', bbox_inches='tight')
plt.show()

### Hubs and Authorities values

In [None]:
# Hub-score distribution: x is the score, y the number of nodes with that exact score.
lists = sorted(counter_hubs.items()) # sorted by key, return a list of tuples
x, y = zip(*lists) # unpack a list of pairs into two tuples
plt.figure(figsize=(20,10))
plt.plot(x, y)
# Save only after plotting; saving first wrote an empty canvas to disk.
plt.savefig('dist_centrality_hubs.png', bbox_inches='tight')
plt.show()

In [None]:
# Inspect the raw series behind the most recently run plot cell
# (the hub scores and their counts when the notebook is run top to bottom).
x, y

In [None]:
# Authority-score distribution: x is the score, y the number of nodes with that exact score.
lists = sorted(counter_auths.items()) # sorted by key, return a list of tuples
x, y = zip(*lists) # unpack a list of pairs into two tuples
plt.figure(figsize=(20,10))
plt.plot(x, y)
# Save only after plotting; saving first wrote an empty canvas to disk.
plt.savefig('dist_centrality_auths.png', bbox_inches='tight')
plt.show()

In [None]:
# Display the authority-score histogram (score -> number of nodes).
counter_auths