In [2]:
import numpy as np
import pandas as pd
import networkx as nx
from scipy import stats as st
import matplotlib.pyplot as plt
G = nx.Graph()
# Given G as the graph

We use an undirected graph because human relationships are bi-directional in most cases. With an assumption that the author who comes first on a paper is the initiator of the relationship we would maybe have used a directed graph, but this is not a reasonable assumption.

In [None]:
print(f'There are {G.number_of_nodes()} and {G.number_of_edges()} in the graph with resulting density of {nx.density(G)}')

In [None]:
degrees = np.array([d for _, d in G.degree()])

print(f'Statistics for Degrees:\nMedian {degrees.median()}\nMean {degrees.mean()}\nMode {st.mode(degrees)}\nMin: {degrees.min()}\nMax: {degrees.max()}')

In [None]:
strengths = []
for node in G.nodes:
    strengths.append(sum([G.get_edge_data(*edge)['weight'] for edge in G.edges(node)]))
print(f'Statistics for Node Strengths:\nMedian {strengths.median()}\nMean {strengths.mean()}\nMode {st.mode(strengths)}\nMin: {strengths.min()}\nMax: {strengths.max()}')

In [None]:
plt.hist(degrees, nbins=100)
plt.show()

This shows the heavy-tailed nature of the network.

The following lines find the top author ID's by degree which we subsequently look up to find out what they study.

In [3]:
degrees = np.random.random(7)
G.nodes[np.argsort(degrees)[:5]]

array([0.20084418, 0.21006797, 0.28561496, 0.53535729, 0.63216144])

In [None]:
spearman_cor = st.spearmanr(G.nodes['median_citation_count'], degrees)
plt.scatter(G.nodes['median_citation_count'], degrees)
plt.title(f'Median Citation Count vs Degree Count')
plt.xlabel(f'Median Citation Count')
plt.ylabel(f'Degree Count')
plt.show()

The spearman correlation s more robust to outliers, and it seems there are are couple of these in the data.

In [12]:
# top 5 author's names:

import requests
import pandas as pd

BASE_URL = "https://api.semanticscholar.org/graph/"
VERSION = "v1/"
RESOURCE = "author/batch"
URL = BASE_URL + VERSION + RESOURCE
id_list = [
    3847277,
    118515406,
    48961279,
    150304655,
    34201396,
]

json_data = {"ids": [str(i) for i in id_list]}
r = requests.post(URL, params={"fields": "authorId,name,citationCount"}, json=json_data)
print("total citationcount: ", sum([i["citationCount"] for i in r.json()]))
r.json()

total citationcount:  11739


[{'authorId': '3847277', 'name': 'Y. Michalakis', 'citationCount': 9111},
 {'authorId': '118515406', 'name': 'J. Corvino', 'citationCount': 199},
 {'authorId': '48961279', 'name': 'Adam Jochem', 'citationCount': 1210},
 {'authorId': '150304655', 'name': 'Sean R Peters', 'citationCount': 47},
 {'authorId': '34201396', 'name': 'M. Bhattacharyya', 'citationCount': 1172}]

Y. Michalakis seems to have worked on a great number of papers, with many different subjects, and it's hard to pinpoint what type of researcher he is.

J. Corvino has worked on several papers related to the same subject, same-sex marriage and morality, it seems like he is a specialist in this area.

Adam Jochem has worked on some papers related to the mitocondria and other biology subjects, ergo he's probably a biologist.

Sean R Peters with only a few works to base a conclusion on, it seems he's a biologist.

Malay Bhattacharyya is a computer scientist.