In [2]:
import pandas as pd
import numpy as np
import networkx as nx

In [3]:
node_df = pd.read_csv('./data/cociteNodes.csv')
edge_df = pd.read_csv('./data/cociteEdges.csv')
print(node_df[:5])
print(edge_df[-5:-1])
node_df[:5]

      x     y  size  PubYear  HierCat  NatureID  \
0 -1247  1345   5.3     1900        1  062366b0   
1  -284  1083   4.6     1900       10  062340b0   
2   822   346   7.9     1900        1  062620e0   
3 -1316  1279   4.6     1900       10  062149b0   
4   587  1598   4.0     1900        4  062523e0   

                                               Title  
0                            Atmospheric Electricity  
1  The Conductivity produced in Gases by the Moti...  
2                     Albinism and Natural Selection  
3                            Atmospheric Electricity  
4                        Leaf Decay and Autumn Tints  
          source     target path
239616  144926a0   276067a0  NaN
239617  150267a0   163874b0  NaN
239618  342275a0   342553a0  NaN
239619  268218a0  2171136a0  NaN


Unnamed: 0,x,y,size,PubYear,HierCat,NatureID,Title
0,-1247,1345,5.3,1900,1,062366b0,Atmospheric Electricity
1,-284,1083,4.6,1900,10,062340b0,The Conductivity produced in Gases by the Moti...
2,822,346,7.9,1900,1,062620e0,Albinism and Natural Selection
3,-1316,1279,4.6,1900,10,062149b0,Atmospheric Electricity
4,587,1598,4.0,1900,4,062523e0,Leaf Decay and Autumn Tints


In [4]:
idx_node_df = node_df.reset_index()

## Transform to NetworkX Format

In [5]:
G = nx.DiGraph()
nID2index = {}
idx = 0
for row in idx_node_df.iterrows():
    obj = row[1]
    G.add_node(row[0], id=obj['NatureID'], pubYear=obj['PubYear'], size=obj['size'], title=obj['Title'])
    nID2index[obj['NatureID']] = row[0]

In [6]:
print(nID2index['062366b0'])
print(G.nodes[nID2index['062366b0']]['id'])

0
062366b0


In [7]:
for row in edge_df.iterrows():
    source = nID2index[row[1]['source']]
    target = nID2index[row[1]['target']]
    G.add_edge(source, target)

In [8]:
in_degree = G.in_degree()
out_degree = G.out_degree()
type(in_degree)

networkx.classes.reportviews.InDegreeView

In [9]:
import numpy as np
np.array([ c for i, c in nx.clustering(G).items()]).mean()

0.17292525539405695

In [10]:
G1 = G.to_undirected()
graphs = list(nx.connected_component_subgraphs(G1))

AttributeError: module 'networkx' has no attribute 'connected_component_subgraphs'

In [None]:
graphs.sort(key=len)

In [None]:
len(graphs[-1].nodes())

In [11]:
citation = [e[1] for e in in_degree]
reference = [e[1] for e in out_degree]

In [12]:
citation = np.array(citation)
reference = np.array(reference)
print('The max number of citation: ', citation.max())
print('The max number of reference: ', reference.max())
print('The median number of citation: ', np.median(citation))
print('The median number of reference: ', np.median(reference))
print('The average number of edge per node:', citation.mean() )
(citation > 1).sum()

The max number of citation:  78
The max number of reference:  77
The median number of citation:  2.0
The median number of reference:  2.0
The average number of edge per node: 2.714267914184092


46894

- Only choose the paper with over 5 citation in this paper collection

In [13]:
hCite = [e[0] for e in in_degree if e[1] > 4]

In [14]:
hCite[:10]

[2, 7, 23, 31, 58, 61, 72, 74, 87, 88]

In [15]:
SG = G.subgraph(hCite)
print(len(SG.nodes()))
print(len(SG.edges()))

# we do not care about isolated nodes
node_list = [e[0] for e in SG.degree() if e[1] != 0]
len(node_list)
max_component = max(nx.connected_component_subgraphs(SG.to_undirected()), key=len)
len(max(nx.connected_component_subgraphs(SG.to_undirected()), key=len).nodes())

16025
46754


AttributeError: module 'networkx' has no attribute 'connected_component_subgraphs'

In [None]:
positiveSG = SG.subgraph(node_list)
max_component = max(nx.connected_component_subgraphs(positiveSG.to_undirected()), key=len)
data = nx.node_link_data(max_component)

In [None]:
import json

fp = open('natureH5Component.json', 'w')
json.dump(data, fp)
fp.close()
