In [1]:
import requests
import json
from numerize import numerize

In [2]:
# Query-string fragment carrying a contact e-mail; `request` appends it to every API call.
# NOTE(review): presumably this opts into OpenAlex's "polite pool" -- confirm against the API docs.
polite = 'mailto=udevz@student.kit.edu'

In [3]:
# Names of OpenAlex entity types; 'authors' and 'institutions' are the ones queried in the cells below.
# NOTE(review): the original note here described a search workflow (search by type and phrase, then
# list an author's works) that is not implemented in the visible cells, and `wanted_type` is not
# referenced in them either -- confirm whether this list is still needed.
wanted_type = ['works','authors','venues','institutions','concepts']

In [4]:
# Convenience wrapper used by every cell that talks to the API.

def request(api_call, timeout=30):
    """Send a GET request for `api_call` with the `polite` mailto parameter attached.

    Parameters
    ----------
    api_call : str
        Full request URL. It may end in '?', already carry query parameters,
        or have no query string at all.
    timeout : float, optional
        Seconds to wait for the server before `requests` raises a timeout
        error instead of blocking indefinitely. Defaults to 30.

    Returns
    -------
    dict or None
        The decoded JSON body when the server answers with status 200.
        For any other status a message is printed and None is returned.
    """
    # Pick the joiner that yields a well-formed query string for every URL shape:
    # no '?' yet -> start one; trailing '?'/'&' -> nothing needed; otherwise '&'.
    if '?' not in api_call:
        separator = '?'
    elif api_call.endswith(('?', '&')):
        separator = ''
    else:
        separator = '&'
    req_string = api_call + separator + polite

    # Without a timeout a stalled connection would hang the notebook kernel.
    req = requests.get(req_string, timeout=timeout)
    if req.status_code == 200:
        return req.json()
    print('Request not successful ', req.status_code)
    return None

In [5]:
def random_author():
    """Query the OpenAlex `/authors/random` endpoint.

    Returns whatever `request` returns for that URL: the decoded JSON on
    success, otherwise None.
    """
    random_author_url = 'https://api.openalex.org/authors/random?'
    author_record = request(random_author_url)
    return author_record

In [6]:
# Total number of author records, printed once abbreviated and once as the exact integer.
req_json = request('https://api.openalex.org/authors?')
author_total = req_json['meta']['count']
print('Author count total: ', numerize.numerize(author_total))  # ~244 mio. on 24. June
print('Author count total: ', author_total)

Author count total:  263.18M
Author count total:  263184238


In [8]:
# ORCID coverage: authors matching the has_orcid filter, as an absolute count
# and as a percentage of all authors.
req_json = request('https://api.openalex.org/authors?filter=has_orcid:true')
orcid_author_count = req_json['meta']['count']
print('Author ORCID count total:', numerize.numerize(orcid_author_count))  # ~4.87 mio. on 22. June
all_authors_json = request('https://api.openalex.org/authors?')
orcid_share_pct = 100 * orcid_author_count / all_authors_json['meta']['count']
print('That is', numerize.numerize(orcid_share_pct), '% of all authors')
# Bare expression so the notebook also displays the exact integer.
orcid_author_count

Author ORCID count total: 4.66M
That is 1.77 % of all authors


4655832

In [14]:
# TODO (open question, originally noted as "HOW-TO??"): break the ORCID coverage down by year.
# Until that is solved, this cell only re-queries the overall count of authors with an ORCID.
orcid_filter_url = 'https://api.openalex.org/authors?filter=has_orcid:true'
req_json = request(orcid_filter_url)
print('Author ORCID count total:', numerize.numerize(req_json['meta']['count']))


Author ORCID count total: 4.61M


In [8]:
# Authors matching works_count > 1, as an absolute count and as a percentage of all authors.
req_json = request('https://api.openalex.org/authors?filter=works_count:>1')
multi_work_authors = req_json['meta']['count']
print('Authors with more than 1 work: ', numerize.numerize(multi_work_authors))
all_authors = request('https://api.openalex.org/authors?')['meta']['count']
multi_work_share_pct = 100 * multi_work_authors / all_authors
print('That is ', numerize.numerize(multi_work_share_pct), '% of all authors')

Authors with more than 1 work:  43.4M
That is  16.55 % of all authors


In [9]:
    # how many of these authors have exactly 1 work associated
# The percentage is taken against a total author count fetched right after the filtered query.
req_json = request('https://api.openalex.org/authors?filter=works_count:1')
single_work_authors = req_json['meta']['count']
print('Authors with exactly 1 work: ', numerize.numerize(single_work_authors))
all_authors = request('https://api.openalex.org/authors?')['meta']['count']
single_work_share_pct = 100 * single_work_authors / all_authors
print('That is ', numerize.numerize(single_work_share_pct), '% of all authors')

Authors with exactly 1 work:  216.54M
That is  82.56 % of all authors


In [10]:
# Authors matching works_count = 0 (no associated publication at all), absolute and relative.
req_json = request('https://api.openalex.org/authors?filter=works_count:0')
zero_work_authors = req_json['meta']['count']
print('Authors with less than 1 work: ', numerize.numerize(zero_work_authors))
all_authors = request('https://api.openalex.org/authors?')['meta']['count']
zero_work_share_pct = 100 * zero_work_authors / all_authors
print('That is ', numerize.numerize(zero_work_share_pct), '% of all authors')

Authors with less than 1 work:  1.29M
That is  0.49 % of all authors


In [18]:
# Distribution of authors over small works counts: exactly 2, 3 and 4 works, and more than 4.
# The total author count is fetched once up front, so every percentage uses the same
# denominator and only a single extra request is needed for the whole cell.
total_author_count = request('https://api.openalex.org/authors?')['meta']['count']

# (filter expression, label printed in front of the numbers)
works_count_buckets = [
    ('works_count:2', 'Authors with exactly 2 work: '),
    ('works_count:3', 'Authors with exactly 3 work: '),
    ('works_count:4', 'Authors with exactly 4 work: '),
    ('works_count:>4', 'Authors with more than 4 [>=5] work: '),
]
for works_filter, label in works_count_buckets:
    req_json = request('https://api.openalex.org/authors?filter=' + works_filter)
    bucket_count = req_json['meta']['count']
    bucket_share = numerize.numerize(100 * bucket_count / total_author_count)
    print(label, numerize.numerize(bucket_count), bucket_share, '%')

# Closing summary for the last (>4) bucket; kept so the cell prints the same lines as before.
print('That is ', bucket_share, '% of all authors')

Authors with exactly 2 work:  19.13M 7.3 %
Authors with exactly 3 work:  7.03M 2.68 %
Authors with exactly 4 work:  3.72M 1.42 %
Authors with more than 4 [>=5] work:  13.5M 5.15 %
That is  5.15 % of all authors


In [43]:
# curious: see most cited authors overall (results are requested sorted by cited_by_count, descending)
req_json = request('https://api.openalex.org/authors?sort=cited_by_count:desc')

print('Cited by count top five')
# Iterate over a slice rather than indexing 0..4, so a result page with fewer
# than five entries prints what is there instead of raising IndexError.
for author in req_json['results'][:5]:
    print(author['display_name'], ' : ', author['cited_by_count'])

Cited by count top 5
Douglas G. Altman  :  602089
Eric S. Lander  :  597930
David Moher  :  447149
Walter C. Willett  :  440535
Yoshua Bengio  :  431727


In [44]:
# curious: see highest output authors (results are requested sorted by works_count, descending)
req_json = request('https://api.openalex.org/authors?sort=works_count:desc')

print('Works count top five authors')
# Iterate over a slice rather than indexing 0..4, so a result page with fewer
# than five entries prints what is there instead of raising IndexError.
for author in req_json['results'][:5]:
    print(author['display_name'], ' : ', author['works_count'])


Works count top five authors
Charles Thomas Parker  :  93877
George M. Garrity  :  87350
Dorothea Taylor  :  18953
Ashok Kumar  :  6919
George M Garrity  :  6888


In [None]:
# Authors grouped by the type of their last known institution, with each group's
# share of all authors. Fills `author_per_institution_type` (display name -> count).
req_json = request('https://api.openalex.org/authors?group_by=last_known_institution.type')
# The denominator does not change between groups, so request it once instead of
# once per loop iteration.
total_author_count = request('https://api.openalex.org/authors?')['meta']['count']
author_per_institution_type = {}
for item in req_json['group_by']:
    author_per_institution_type[item['key_display_name']] = item['count']
    share_pct = item['count'] / total_author_count * 100
    print(item['key_display_name'], item['count'], numerize.numerize(share_pct), '%')
# many unknowns -> usefulness questionable

In [None]:
# print avg. authors per institution type
# first, # institutions by type
req_json = request('https://api.openalex.org/institutions?group_by=type')
institution_type_count = {}
for item in req_json['group_by']:
    institution_type_count[item['key_display_name']] = item['count']
    print(item['key_display_name'],item['count'])

# second, divide the author counts collected in `author_per_institution_type`
# (built by the cell that groups authors by last_known_institution.type) by the
# number of institutions of the same type.
for inst_type, n_institutions in institution_type_count.items():
    n_authors = author_per_institution_type.get(inst_type)
    # The two groupings come from different endpoints, so a type may be missing
    # on the author side; report that instead of failing with KeyError, and
    # avoid dividing by an institution count of zero.
    if n_authors is None or not n_institutions:
        print('Avg. no. of authors in',inst_type,':','n/a')
        continue
    print('Avg. no. of authors in',inst_type,':',n_authors/n_institutions)


In [18]:
# Authors grouped by the country code of their last known institution,
# collected as a mapping from the group's display name to its author count.
req_json = request('https://api.openalex.org/authors?group_by=last_known_institution.country_code')
author_per_institution_country = {
    group['key_display_name']: group['count']
    for group in req_json['group_by']
}
# many unknowns -> usefulness questionable

{'unknown': 187674894,
 'United States of America': 13595022,
 'China': 11063692,
 'United Kingdom of Great Britain and Northern Ireland': 2923508,
 'Japan': 2653188,
 'Germany': 2172109,
 'India': 1944796,
 'France': 1639115,
 'Brazil': 1628645,
 'Canada': 1406749,
 'Russian Federation': 1091979,
 'Korea, Republic of': 1024174,
 'Spain': 974068,
 'Italy': 962073,
 'Australia': 894150,
 'Taiwan, Province of China': 806415,
 'Indonesia': 766610,
 'Iran, Islamic Republic of': 715635,
 'Netherlands': 630414,
 'Switzerland': 503001,
 'Mexico': 488849,
 'Poland': 459628,
 'Turkey': 358858,
 'Sweden': 346142,
 'Belgium': 313389,
 'Malaysia': 311354,
 'Egypt': 284903,
 'Israel': 281759,
 'Colombia': 277427,
 'Denmark': 243248,
 'Czechia': 241564,
 'Pakistan': 238385,
 'South Africa': 236352,
 'Austria': 223299,
 'Singapore': 221744,
 'Finland': 211946,
 'Ukraine': 208383,
 'Nigeria': 207030,
 'Argentina': 198758,
 'Greece': 191220,
 'Thailand': 184993,
 'Ireland': 170566,
 'Portugal': 169162,

In [None]:
# Authors grouped by their last known institution, collected as a mapping from
# the group's display name to its author count.
req_json = request('https://api.openalex.org/authors?group_by=last_known_institution')
author_per_institution = {}
for item in req_json['group_by']:
    # Write into this cell's own dict. Storing into `author_per_institution_country`
    # here would leave `author_per_institution` empty and mix institution names
    # into the per-country counts.
    author_per_institution[item['key_display_name']] = item['count']
# many unknowns -> usefulness questionable
