In [1]:
import json

import networkx as nx
import numpy as np
import pandas as pd

In [2]:
# Load the node table and the edge table from the local data directory.
node_df = pd.read_csv('./data/cociteNodes.csv')
edge_df = pd.read_csv('./data/cociteEdges.csv')

# Preview the first five node rows.
print(node_df.head(5))
# Preview rows near the end of the edge table.
# NOTE(review): the -5:-1 slice stops before the final row, so four rows are
# shown rather than five -- confirm whether the last row was meant to appear.
print(edge_df.iloc[-5:-1])

      x     y  size  PubYear  HierCat  NatureID  \
0 -1247  1345   5.3     1900        1  062366b0   
1  -284  1083   4.6     1900       10  062340b0   
2   822   346   7.9     1900        1  062620e0   
3 -1316  1279   4.6     1900       10  062149b0   
4   587  1598   4.0     1900        4  062523e0   

                                               Title  
0                            Atmospheric Electricity  
1  The Conductivity produced in Gases by the Moti...  
2                     Albinism and Natural Selection  
3                            Atmospheric Electricity  
4                        Leaf Decay and Autumn Tints  
          source     target path
239616  144926a0   276067a0  NaN
239617  150267a0   163874b0  NaN
239618  342275a0   342553a0  NaN
239619  268218a0  2171136a0  NaN


In [3]:
# Copy the node table with its current index moved into an explicit column
# (drop=False is the pandas default, spelled out here for clarity).
idx_node_df = node_df.reset_index(drop=False)

## Transform to NetworkX Format

In [4]:
# Build a directed graph with one node per row of the node table.
# Each node is keyed by the row's index label and carries the NatureID,
# publication year, size and title as attributes.
G = nx.DiGraph()

# Lookup NatureID -> node key, so rows that refer to papers by NatureID can be
# translated to graph nodes.
nID2index = {}

for node_key, record in idx_node_df.iterrows():
    G.add_node(
        node_key,
        id=record['NatureID'],
        pubYear=record['PubYear'],
        size=record['size'],
        title=record['Title'],
    )
    nID2index[record['NatureID']] = node_key

In [5]:
# Sanity check: the lookup table and the stored node attribute must agree.
probe_key = nID2index['062366b0']
print(probe_key)
print(G.nodes[probe_key]['id'])

0
062366b0


In [6]:
# Add one directed edge per row of the edge table, translating the NatureID
# strings in the 'source' and 'target' columns into node keys.
# Iterating the two columns directly avoids DataFrame.iterrows(), which builds
# a full Series object for every row. A NatureID that is missing from the
# lookup still raises KeyError, exactly as the row-by-row loop did.
G.add_edges_from(
    (nID2index[src_id], nID2index[dst_id])
    for src_id, dst_id in zip(edge_df['source'], edge_df['target'])
)

In [7]:
# Degree views of the directed graph: incoming and outgoing edge counts per node.
in_degree, out_degree = G.in_degree(), G.out_degree()
type(in_degree)

networkx.classes.reportviews.InDegreeView

In [8]:
# Iterating a degree view yields (node, degree) pairs; keep only the degree.
# In-degree is used as the citation count, out-degree as the reference count.
citation = [deg for _node, deg in in_degree]
reference = [deg for _node, deg in out_degree]

In [9]:
# Switch to numpy arrays so the reductions below are available.
citation, reference = np.array(citation), np.array(reference)

# Print each summary statistic with its label (print's default separator
# puts a single space between label and value).
for label, value in (
    ('The max number of citation: ', citation.max()),
    ('The max number of reference: ', reference.max()),
    ('The median number of citation: ', np.median(citation)),
    ('The median number of reference: ', np.median(reference)),
    ('The average number of edge per node:', citation.mean()),
):
    print(label, value)

# Cell output: how many nodes have more than 5 incoming edges.
(citation > 5).sum()

The max number of citation:  78
The max number of reference:  77
The median number of citation:  2.0
The median number of reference:  2.0
The average number of edge per node: 2.714267914184092


11533

- Keep only the papers that have more than 5 citations within this paper collection.

In [10]:
# Node keys whose in-degree within this graph is greater than 5.
hCite = [node for node, n_cited in in_degree if n_cited > 5]

In [11]:
# Show the first ten selected node keys as a quick check.
hCite[:10]

[7, 23, 31, 58, 72, 87, 92, 112, 115, 133]

In [12]:
# Subgraph restricted to the selected nodes and the edges among them.
SG = G.subgraph(hCite)
print(SG.number_of_nodes())
print(SG.number_of_edges())

# we do not care about isolated nodes: keep nodes that still have at least
# one edge inside the subgraph
node_list = [node for node, deg in SG.degree() if deg != 0]
len(node_list)

11533
35693


10389

In [13]:
# Restrict the subgraph to nodes that still have at least one edge, then
# serialise it to a node-link dictionary.
positiveSG = SG.subgraph(node_list)
data = nx.node_link_data(positiveSG)

# node_link_data writes each node's key under "id", which overwrites the node
# attribute of the same name (the NatureID string). Put the NatureID back
# under its own key so it is not lost; "id" itself is left unchanged, so the
# existing node/link structure is still valid for consumers.
for node_record in data['nodes']:
    node_record['NatureID'] = positiveSG.nodes[node_record['id']]['id']

In [14]:
# Write the node-link dictionary to disk as JSON. The context manager closes
# the file even when json.dump raises (for example on a value that is not
# JSON-serialisable), which the manual open()/close() pair did not guarantee.
with open('natureH6.json', 'w') as fp:
    json.dump(data, fp)
