In [1]:
import requests
from gql import gql, Client, RequestsHTTPTransport
from IPython.display import display, Markdown
import asyncio
import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
import math


In [3]:
pip list

Package                            Version
---------------------------------- -------------------
aiohttp                            3.6.2
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 1.9.12
anaconda-project                   0.8.3
argh                               0.26.2
asn1crypto                         1.3.0
astroid                            2.4.2
astropy                            4.0.1.post1
async-timeout                      3.0.1
atomicwrites                       1.4.0
attrs                              19.3.0
autopep8                           1.5.3
Babel                              2.8.0
backcall                           0.2.0
backports.functools-lru-cache      1.6.1
backports.shutil-get-terminal-size 1.0.0
backports.tempfile                 1.0
backports.weakref                  1.0.post1
bcrypt                             3.1.7
beautifulsoup4                     4.9.1
bitarray                     

In [4]:
_transport = RequestsHTTPTransport(
    url='https://api.datacite.org/graphql',
    use_json=True,
)


client = Client(
    transport=_transport,
    fetch_schema_from_transport=True,
)

In [5]:
# paste the designed and tested (graphqli) query 
query_string = """
{
  person(id: "https://orcid.org/0000-0002-2906-2577") {
    id
    name
    datasets {
      nodes {
        id
        downloadCount
        viewCount
        citations {
          totalCount
        }
      }
    }
    publications {
      totalCount
      nodes {
        id
        relatedIdentifiers {
          relatedIdentifier
        }
      }
    }
  }
}

"""

query = gql(query_string)
data = client.execute(query)
display(Markdown((data['person']['name'])))


Nicholas Deutscher

In [6]:
# sort the payload dict for total citation counts
ordered = sorted(data['person']['datasets']['nodes'], key=lambda i: i["citations"]["totalCount"], reverse=True)
ordered[:2]


[{'id': 'https://doi.org/10.14291/tccon.ggg2014.darwin01.r0/1149290',
  'downloadCount': 6,
  'viewCount': 75,
  'citations': {'totalCount': 24}},
 {'id': 'https://doi.org/10.14291/tccon.ggg2014.wollongong01.r0/1149291',
  'downloadCount': 7,
  'viewCount': 65,
  'citations': {'totalCount': 20}}]

In [7]:
# reformat the dict for data analysis 

agg=[]

for dataset in ordered:
    agg.append(
        {'id':dataset['id'],
          'downloadCount': dataset['downloadCount'],
          'viewCount': dataset['viewCount'],
          'citationCount': dataset['citations']['totalCount']
        }
    )
  
    #print(node)
#agg


In [8]:
datasets_data = pd.DataFrame(agg)
datasets_data

Unnamed: 0,id,downloadCount,viewCount,citationCount
0,https://doi.org/10.14291/tccon.ggg2014.darwin0...,6,75,24
1,https://doi.org/10.14291/tccon.ggg2014.wollong...,7,65,20
2,https://doi.org/10.14291/tccon.ggg2014.orleans...,1,18,19
3,https://doi.org/10.14291/tccon.ggg2014.bremen0...,2,18,15
4,https://doi.org/10.14291/tccon.ggg2014.bialyst...,1,21,11
5,https://doi.org/10.14291/tccon.ggg2014.nyalesu...,1,10,1
6,https://doi.org/10.1594/pangaea.848263,0,0,0
7,https://doi.org/10.1594/pangaea.872007,0,0,0
8,https://doi.org/10.14291/tccon.ggg2014.nyalesu...,1,44,0
9,https://doi.org/10.14291/tccon.ggg2014.bremen0...,2,51,0


In [9]:
display(datasets_data.describe())
display(datasets_data.sum())

Unnamed: 0,downloadCount,viewCount,citationCount
count,12.0,12.0,12.0
mean,2.583333,32.583333,7.5
std,2.466441,25.256712,9.577436
min,0.0,0.0,0.0
25%,1.0,16.0,0.0
50%,1.5,27.5,0.5
75%,5.0,52.0,16.0
max,7.0,75.0,24.0


id               https://doi.org/10.14291/tccon.ggg2014.darwin0...
downloadCount                                                   31
viewCount                                                      391
citationCount                                                   90
dtype: object

In [10]:
def h_index(citationarray):
    descending_df = pd.DataFrame(citationarray)
    descending_df.columns = ['count']
    descending_df=descending_df.sort_values(by=['count'], ascending=False)
    counter = [*range(len(descending_df['count'].index),0,-1)]
    for dC in [*range(len(descending_df['count'].index),0,-1)]:
        descending_df[dC] = descending_df.apply(lambda x : True
                if x['count'] >= dC else False, axis = 1)

        counter[len(descending_df['count'].index)-dC] = len(descending_df[dC][descending_df[dC] == True].index)
    summary = pd.DataFrame({'count': descending_df['count'],'citation': [*range(len(descending_df['count'].index),0,-1)],'h_index': counter})
    h=summary.loc[summary['citation'] == summary['h_index']]['h_index'].array[0]
    return h


In [11]:
def i10_index(citationarray):
    series = citationarray.apply(lambda x: True if x > 0 else False)
    cited_papers = len(series[series == True].index)
    i10_column = citationarray.apply(lambda x: True if x >= 10 else False)
    return len(i10_column[i10_column == True].index)

In [12]:
def g_index(citationarray):  
    descending_df = pd.DataFrame(citationarray)
    descending_df.columns = ['count']
    descending_df=descending_df.sort_values(by=['count'], ascending=False)
    series = descending_df.apply(lambda x: True if x['count'] > 0 else False , axis=1)
    cited_papers = len(series[series == True].index)

    ranger = [cited_papers,0,-1]
    g_max = math.floor(np.sqrt(descending_df['count'].sum()))

    if g_max <= cited_papers:
        g = g_max
    else:
        g = 0
        k = True
        while k:
            g += 1 
            if g >= g_max:
                k = False
    return g

In [13]:

sums=pd.DataFrame(datasets_data.sum(), columns=['sum'])
sums = sums.drop('id')
sums['h-index (Young)'] = (0.54*np.sqrt(sums['sum'].astype(np.float64))).round(1)
sums['h-index'] = [h_index(datasets_data['downloadCount']),h_index(datasets_data['viewCount']),h_index(datasets_data['citationCount'])]
sums['i10-index'] = [i10_index(datasets_data['downloadCount']),i10_index(datasets_data['viewCount']),i10_index(datasets_data['citationCount'])]
sums['g-index'] = [g_index(datasets_data['downloadCount']),g_index(datasets_data['viewCount']),g_index(datasets_data['citationCount'])]
display(sums)


Unnamed: 0,sum,h-index (Young),h-index,i10-index,g-index
downloadCount,31,3.0,4,0,5
viewCount,391,10.7,10,10,19
citationCount,90,5.1,5,5,9


In [15]:
dCwt=2
vCwt=1
cCwt=4
datasets_data['Index'] = (datasets_data['downloadCount'] + datasets_data['viewCount'] + datasets_data['citationCount'])
datasets_data['wtIndex'] = (dCwt*datasets_data['downloadCount'] + vCwt*datasets_data['viewCount'] + cCwt*datasets_data['citationCount'])
#datasets_data['expIndex'] = 10*np.exp((-(datasets_data['downloadCount']/10 + datasets_data['viewCount']/20 + datasets_data['citationCount'])))

display(datasets_data)

Unnamed: 0,id,downloadCount,viewCount,citationCount,Index,wtIndex
0,https://doi.org/10.14291/tccon.ggg2014.darwin0...,6,75,24,105,183
1,https://doi.org/10.14291/tccon.ggg2014.wollong...,7,65,20,92,159
2,https://doi.org/10.14291/tccon.ggg2014.orleans...,1,18,19,38,96
3,https://doi.org/10.14291/tccon.ggg2014.bremen0...,2,18,15,35,82
4,https://doi.org/10.14291/tccon.ggg2014.bialyst...,1,21,11,33,67
5,https://doi.org/10.14291/tccon.ggg2014.nyalesu...,1,10,1,12,16
6,https://doi.org/10.1594/pangaea.848263,0,0,0,0,0
7,https://doi.org/10.1594/pangaea.872007,0,0,0,0,0
8,https://doi.org/10.14291/tccon.ggg2014.nyalesu...,1,44,0,45,46
9,https://doi.org/10.14291/tccon.ggg2014.bremen0...,2,51,0,53,55
