### Network analysis of scientific collaboration

In [1]:
# Libraries

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import cycle, chain

import networkx as nx
from networkx.algorithms import community

AMiner-Coauthor

In [2]:
# Read the AMiner co-author edge list: a tab-separated file without a header
# row, one row per author pair, columns named source / target / weight here.

coauthor_columns = ["source", "target", "weight"]
df = pd.read_csv(
    'AMiner-Coauthor.txt',
    sep="\t",
    header=None,
    names=coauthor_columns,
)
coauthor_shape = df.shape
print('Coauthor dataset dimension:', coauthor_shape)
df.head()

Coauthor dataset dimension: (4258946, 3)


Unnamed: 0,source,target,weight
0,#522324,1034146,1
1,#1355779,1229932,2
2,#688814,947067,2
3,#1329221,1140429,1
4,#742331,314944,1


In [3]:
# Print the column dtypes and memory footprint of the co-author table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4258946 entries, 0 to 4258945
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   source  object
 1   target  int64 
 2   weight  int64 
dtypes: int64(2), object(1)
memory usage: 97.5+ MB


In [4]:
# Normalise the 'source' ids: the raw file prefixes them with '#' (so pandas
# reads the column as object), while 'target' is already parsed as int64.
# Casting to str first keeps this cell idempotent: once 'source' has been
# converted to int, the `.str` accessor alone would raise AttributeError if the
# cell were executed a second time.
df["source"] = df["source"].astype(str).str.strip('#').astype(int)

In [5]:
# Count how many rows carry each distinct value of 'weight', most frequent first.
# NOTE(review): 'weight' is presumably the number of co-authored papers — confirm
# against the dataset documentation.
df["weight"].value_counts()

1     3288462
2      553011
3      185736
4       85127
5       46816
       ...   
89          1
87          1
66          1
83          1
86          1
Name: weight, Length: 114, dtype: int64

In [6]:
# Check whether any cell of the co-author table is missing (NaN);
# evaluates to a single boolean for the whole frame.
df.isnull().values.any()

False

In [7]:
# Count fully duplicated rows (same source, target and weight);
# the first occurrence of each row is not counted as a duplicate.
df.duplicated().sum()

0

In [8]:
# Check whether any pair of authors is listed more than once, in EITHER
# orientation. The graph built from this table is undirected, so (a, b) and
# (b, a) would collapse into a single edge and one of the two weights would be
# silently overwritten; a plain duplicated(['target', 'source']) only catches
# rows repeated in the same orientation.
# Sorting the two ids within each row yields an orientation-independent key.
# Requires 'source' to be numeric already (mixed str/int ids cannot be sorted).
pair_keys = pd.DataFrame(
    np.sort(df[['source', 'target']].to_numpy(), axis=1),
    index=df.index,
)
df[pair_keys.duplicated()]

Unnamed: 0,source,target,weight


AMiner-Author

In [9]:
# Load the data from AMiner-Author
#
# The file is a sequence of records separated by blank lines; every non-blank
# line has the form "#<key> <value>". Each record becomes one dict in
# `dict_list`, keyed by the text before the first space.

# Source of this function for preprocessing https://www.kaggle.com/kmader/preprocessing-author-data

with open('AMiner-Author.txt', 'r', errors='ignore') as f:
    dict_list = []
    c_dict = {}
    for line in f:
        # Drop the leading marker character, then surrounding whitespace.
        # NOTE(review): the first character is removed unconditionally, even
        # when it is not '#' — confirm every record line starts with '#'.
        c_line = line.strip()[1:].strip()
        if not c_line:
            # Blank (or marker-only) line: close the current record, if any.
            if c_dict:
                dict_list.append(c_dict)
            c_dict = {}
        else:
            # Key is everything before the first space, value is the rest
            # (empty string when the line has no space).
            key, _, value = c_line.partition(' ')
            c_dict[key] = value
    # Flush the final record: without this, the last author is silently lost
    # whenever the file does not end with a blank line.
    if c_dict:
        dict_list.append(c_dict)

In [10]:
# Turn the parsed records into a table: one row per author record, one column
# per key encountered in the file (keys missing from a record become NaN).

author_df = pd.DataFrame(data=dict_list)
author_shape = author_df.shape
print('Author dataset dimension:', author_shape)
author_df.head()

Author dataset dimension: (1712433, 11)


Unnamed: 0,index,n,a,pc,cn,hi,pi,upi,t,n-2nd,"Valladolid,"
0,1,O. Willum,"Res. Center for Microperipherik, Technische Un...",1,0,0,0.0,0.0,new product;product group;active product;long ...,,
1,2,D. Wei,"Dept. of Electr. & Comput. Eng., Drexel Univ.,...",1,0,0,0.0,0.0,lowpass filter;multidimensional product filter...,,
2,3,Wenhu Wu,,1,0,0,0.0,0.0,MAP adaptation;adaptation data;adaptation utte...,,
3,4,Zhiyuan Zeng,"College of Geography Science, Nanjing Normal U...",1,0,0,0.0,0.0,normalized difference vegetation index;ratio v...,,
4,5,Erzen Hyko,"Department of Computer Science, University of ...",1,0,0,0.0,0.0,information content;Spatio-Temporal Informatio...,,


In [11]:
# Give the terse keys from the raw file readable column names.
readable_names = {
    'index': 'id',
    'n': 'author',
    'a': 'affiliation',
    'pc': 'papers',
    'cn': 'citations',
    'hi': 'h_id',
    'pi': 'p_id',
    'upi': 'up_id',
    't': 'research',
}

# Columns to keep, in display order; selecting them drops the two stray
# columns 'n-2nd' and 'Valladolid,'.
kept_columns = ['id', 'author', 'affiliation', 'papers', 'citations', 'h_id', 'p_id', 'up_id', 'research']

author_df = author_df.rename(columns=readable_names)[kept_columns]
author_df.head()

Unnamed: 0,id,author,affiliation,papers,citations,h_id,p_id,up_id,research
0,1,O. Willum,"Res. Center for Microperipherik, Technische Un...",1,0,0,0.0,0.0,new product;product group;active product;long ...
1,2,D. Wei,"Dept. of Electr. & Comput. Eng., Drexel Univ.,...",1,0,0,0.0,0.0,lowpass filter;multidimensional product filter...
2,3,Wenhu Wu,,1,0,0,0.0,0.0,MAP adaptation;adaptation data;adaptation utte...
3,4,Zhiyuan Zeng,"College of Geography Science, Nanjing Normal U...",1,0,0,0.0,0.0,normalized difference vegetation index;ratio v...
4,5,Erzen Hyko,"Department of Computer Science, University of ...",1,0,0,0.0,0.0,information content;Spatio-Temporal Informatio...


In [12]:
# Count fully duplicated author rows (identical in every column);
# the first occurrence of each row is not counted as a duplicate.
author_df.duplicated().sum()

0

In [13]:
# Print the column dtypes and memory footprint of the author table.
# Every column is still `object` here because the parser stores all values as strings.
author_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1712433 entries, 0 to 1712432
Data columns (total 9 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   id           object
 1   author       object
 2   affiliation  object
 3   papers       object
 4   citations    object
 5   h_id         object
 6   p_id         object
 7   up_id        object
 8   research     object
dtypes: object(9)
memory usage: 117.6+ MB


In [14]:
# The parser produced strings for every field; convert the id and the
# bibliometric columns to numeric dtypes, one column at a time.

numeric_columns = ['id', 'papers', 'citations', 'h_id', 'p_id', 'up_id']
for column_name in numeric_columns:
    author_df[column_name] = pd.to_numeric(author_df[column_name])

In [15]:
# Report the number of missing values in the columns the analysis relies on.
missing_value_checks = [
    ('The number of missing values in "Author"', "author"),
    ('The number of missing values in "Papers"', "papers"),
    ('The number of missing values in "Citations"', "citations"),
    ('The number of missing values in "Index"', "id"),
]
for message, column_name in missing_value_checks:
    missing_count = author_df[column_name].isnull().sum()
    print(message, missing_count)

The number of missing values in "Author" 0
The number of missing values in "Papers" 0
The number of missing values in "Citations" 0
The number of missing values in "Index" 0


In [16]:
# Index the author table by author id so that each row can be looked up by
# the same id that the edge list uses for its nodes.
nodes_df = author_df.set_index('id')
nodes_df.head()

Unnamed: 0_level_0,author,affiliation,papers,citations,h_id,p_id,up_id,research
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,O. Willum,"Res. Center for Microperipherik, Technische Un...",1,0,0,0.0,0.0,new product;product group;active product;long ...
2,D. Wei,"Dept. of Electr. & Comput. Eng., Drexel Univ.,...",1,0,0,0.0,0.0,lowpass filter;multidimensional product filter...
3,Wenhu Wu,,1,0,0,0.0,0.0,MAP adaptation;adaptation data;adaptation utte...
4,Zhiyuan Zeng,"College of Geography Science, Nanjing Normal U...",1,0,0,0.0,0.0,normalized difference vegetation index;ratio v...
5,Erzen Hyko,"Department of Computer Science, University of ...",1,0,0,0.0,0.0,information content;Spatio-Temporal Informatio...


In [17]:
# Mapping {author id -> {column name -> value}}, the shape expected by
# nx.set_node_attributes when attaching several attributes per node.
nodes_attributes = nodes_df.to_dict(orient='index')

In [18]:
# Build the undirected co-authorship graph: one edge per row of `df`, with the
# remaining column(s) of the table ('weight') stored as edge attributes.

G = nx.from_pandas_edgelist(df, source='source', target='target', edge_attr=True)

# Attach the author metadata (name, affiliation, counts, ...) to the nodes.
# Authors that never appear in the edge list are not added to the graph.
nx.set_node_attributes(G, nodes_attributes)

# nx.info() was deprecated and then removed in NetworkX 3.0, so print the same
# summary explicitly; this works on every NetworkX version.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
# In an undirected graph every edge contributes to the degree of two nodes.
avg_degree = 2 * n_edges / n_nodes if n_nodes else 0.0
print(f'Type: {type(G).__name__}')
print(f'Number of nodes: {n_nodes}')
print(f'Number of edges: {n_edges}')
print(f'Average degree: {avg_degree:8.4f}')

'Name: \nType: Graph\nNumber of nodes: 1560640\nNumber of edges: 4258946\nAverage degree:   5.4579'

In [19]:
# Check whether the network is connected, i.e. whether every node can be
# reached from every other node.
print(nx.is_connected(G))

False


In [20]:
# Count the connected components (maximal groups of mutually reachable nodes).
nx.number_connected_components(G)

156240

In [21]:
# Materialise the connected components (each one is a set of node ids) and
# record how many nodes each component contains.
node_sets = list(nx.connected_components(G))
nn = [len(component) for component in node_sets]

In [23]:
# Sizes of the largest and the second-largest connected component.
# The list of sizes is sorted in place, biggest first.
nn.sort(reverse=True)
for rank in (0, 1):
    print(nn[rank])

1057194
83


In [None]:
# The original cell drew `G_w`, a name that is never defined in this notebook,
# so it raised NameError on a fresh kernel. Drawing the full graph is not a
# realistic alternative (about 1.5 million nodes), so draw one small connected
# component instead.
# NOTE(review): using the second-largest component is an assumption about what
# `G_w` was meant to be — swap in the intended subgraph if it differs.
components_by_size = sorted(nx.connected_components(G), key=len, reverse=True)
# Index 1 = second-largest; fall back to the only component if the graph is connected.
chosen_component = components_by_size[min(1, len(components_by_size) - 1)]
G_w = G.subgraph(chosen_component)

# Fixed seed so the force-directed layout is reproducible across runs.
layout = nx.spring_layout(G_w, seed=42)

fig, ax = plt.subplots(figsize=(25, 25))
nx.draw_networkx(G_w, pos=layout, ax=ax)
ax.set_title(f'Co-authorship component with {G_w.number_of_nodes()} authors')
plt.show()

In [None]:
# Density of the graph: the share of all possible node pairs that are actually
# joined by an edge (0 = no edges, 1 = complete graph).
nx.density(G)

In [None]:
# Average clustering coefficient over all nodes, computed in its weighted
# variant using the 'weight' edge attribute.
nx.average_clustering(G, weight = 'weight')

In [None]:
# Degree centrality of every node (unweighted: it only counts neighbours).
degree_centrality = nx.degree_centrality(G)

# Rank the nodes by degree centrality and show the four most influential authors
# as: node id, author name, centrality score.
top_by_degree = sorted(degree_centrality.items(), key=lambda item: item[1], reverse=True)[:4]
for author_id, score in top_by_degree:
    print(author_id, G.nodes[author_id]['author'], score)

In [None]:
# Closeness centrality works on distances, while 'weight' grows with the
# strength of a tie, so use the reciprocal of the weight as the edge length.
G_distance_dict = {(u, v): 1 / w for u, v, w in G.edges(data='weight')}
# Store the lengths on the edges under the attribute name 'distance'.
nx.set_edge_attributes(G, values=G_distance_dict, name='distance')
closeness_centrality = nx.closeness_centrality(G, distance='distance')

# Show the four authors with the highest closeness centrality
# as: node id, author name, centrality score.
top_by_closeness = sorted(closeness_centrality.items(), key=lambda item: item[1], reverse=True)[:4]
for author_id, score in top_by_closeness:
    print(author_id, G.nodes[author_id]['author'], score)

In [None]:
# Eigenvector centrality, with edges weighted by the 'weight' attribute.
eigenvector_centrality_weighted = nx.eigenvector_centrality(G, weight='weight')

# Show the four authors with the highest eigenvector centrality
# as: node id, author name, centrality score.
top_by_eigenvector = sorted(
    eigenvector_centrality_weighted.items(),
    key=lambda item: item[1],
    reverse=True,
)[:4]
for author_id, score in top_by_eigenvector:
    print(author_id, G.nodes[author_id]['author'], score)