# Getting Dimensions and Altmetrics data

In this notebook, starting from a list of COVID-19 publications identified by a DOI, PMID or PMCID, we query the Dimensions and Altmetrics APIs. We then export the results as JSON, following the format and structure we need at CWTS. Your needs might vary, but adapting these scripts should be straightforward.

Dimensions API reference: https://docs.dimensions.ai/dsl/index.html
Altmetrics API reference: http://api.altmetric.com

*Please note you will need your own access keys from each of the two APIs to use this code.*

In [96]:
# magics, warnings and imports
%load_ext autoreload
%autoreload 2
import warnings; warnings.simplefilter('ignore')

import os, random, codecs, json, time
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np

seed = 99
random.seed(seed)
np.random.seed(seed)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# load the pub dataframe (see Notebook_1 for this)
# Note: despite the .csv extension, the file is read as gzip-compressed.

df_pub = pd.read_csv("datasets_output/df_pub.csv", compression="gzip")

In [3]:
df_pub.shape

(45646, 14)

In [11]:
df_pub.head()

Unnamed: 0.1,Unnamed: 0,title,abstract,publication_year,publication_month,journal,volume,issue,pages,doi,pmid,pmcid,id,timestamp
0,0,COVID-19: The New Threat,,2020.0,3.0,International Journal of Infection,7.0,1.0,,10.5812/iji.102184,,,0,2020-03-25 18:16:28.508540
1,1,Prominent changes in blood coagulation of pati...,Abstract Background As the number of patients...,2020.0,3.0,Clinical Chemistry and Laboratory Medicine,0.0,0.0,,10.1515/cclm-2020-0188,,,1,2020-03-25 18:16:28.508540
2,2,What Is Needed to Make Interventional Radiolog...,,2020.0,1.0,Korean Journal of Radiology,21.0,,,10.3348/kjr.2020.0163,,,2,2020-03-25 18:16:28.508540
3,3,COVID-19 Infection in Iranian Children: A Case...,,2020.0,4.0,Journal of Pediatrics Review,,,139-144,10.32598/jpr.8.2.139,,,3,2020-03-25 18:16:28.508540
4,4,Computed Tomographic Findings in COVID-19,,2020.0,1.0,Korean Journal of Radiology,21.0,,,10.3348/kjr.2020.0164,,,4,2020-03-25 18:16:28.508540


In [29]:
# Pull one identifier per publication, preferring DOI, then PMID, then PMCID:
# a record only lands in a later group when every earlier identifier is missing.

has_doi = pd.notna(df_pub.doi)
has_pmid = pd.notna(df_pub.pmid)
has_pmcid = pd.notna(df_pub.pmcid)

dois = df_pub.loc[has_doi, "doi"].values
# PMIDs are read as floats (the column contains NaN), so cast through int to get clean strings
pmids = [str(int(raw_pmid)) for raw_pmid in df_pub.loc[~has_doi & has_pmid, "pmid"].values]
pmcids = df_pub.loc[~has_doi & ~has_pmid & has_pmcid, "pmcid"].values

In [30]:
print(len(dois))
print(len(pmids))
print(len(pmcids))

42538
336
2772


In [31]:
# some DOIs will need cleaning
def clean_doi(d):
    """Strip resolver/scheme prefixes from a DOI string.

    Removes any leading ``http(s)://doi.org/``, ``http(s)://dx.doi.org/`` or
    ``doi:`` prefix (case-insensitively, repeated if stacked) and surrounding
    whitespace. Only the *start* of the string is touched, so a DOI whose
    suffix happens to contain ``doi:`` is left intact.

    Parameters
    ----------
    d : Any
        The raw DOI. Non-string values (e.g. ``None`` or NaN) are returned unchanged.

    Returns
    -------
    Any
        The bare DOI string, or ``d`` itself when it is not a string.
    """
    import re

    if not isinstance(d, str):
        return d
    prefix = r"^\s*(?:(?:https?://(?:dx\.)?doi\.org/|doi:)\s*)+"
    return re.sub(prefix, "", d, flags=re.IGNORECASE).strip()

In [32]:
# Normalise every DOI in place of the raw values (also turns the array into a plain list).
dois = list(map(clean_doi, dois))

### Dimensions

In [33]:
# Schema to convert to CWTS-compatible JSON. Skip this if you prefer to have the Dimensions' schema.
# This dict is the template that convert_json() deep-copies for every record: the values below
# are the defaults a record keeps when the Dimensions response lacks the corresponding field.
# Keys that convert_json() never assigns (e.g. "format", "status", "language") always keep
# these defaults.

mapping_scheme = {"id": None,
"format": None,
"status": None,
"publication_type": None,
"doi": None,
"pmid": None,
"pmcid": None,
"title": None,
"year" : None,
"publication_date" : None,
"volume" : None,
"issue" : None,
"pages" : None,
"open_access_versions": [],
"concepts": {},
"journal": {"id": None, "title": None, "issn": None, "eissn": None}, 
"publisher": {"id": None, "name": None},
"open_access_categories": [],
"journal_lists": [],
"author_affiliations": [], #"author_affiliations": [{"first_name": "Sunir", "last_name": "Gohil", "researcher_id": "ur.01154753576.20", "grid_ids": []}, {"first_name": "Sabine", "last_name": "Vuik", "researcher_id": "ur.015721262671.44", "grid_ids": []}, {"first_name": "Ara", "last_name": "Darzi", "researcher_id": "ur.01255016073.58", "grid_ids": []}]
"funding": [], #SKIP for now
"for": [], # [{"first_level": {"id": "11", "name": "Medical and Health Sciences"}, "second_level": {"id": "1117", "name": "Public Health and Health Services"}}]
"language": None,
"references": [],
"clinical_trials": [], #SKIP for now
"created_in_dimensions" : None,
"version_of_record" : None,
"times_cited" : None, #NEW
"relative_citation_ratio" : None #NEW
}

In [46]:
# Convert one Dimensions publication record into the CWTS JSON layout.
import copy

def convert_json(input_from_api, mapping_scheme=mapping_scheme):
    """Map a Dimensions API publication record onto the CWTS schema.

    Parameters
    ----------
    input_from_api : dict
        One element of the ``publications`` list returned by the Dimensions DSL.
        Only ``"id"`` is required; every other field is optional and, when
        absent, the corresponding default from ``mapping_scheme`` is kept.
    mapping_scheme : dict
        Template holding the target keys and their default values. It is
        deep-copied, never modified.

    Returns
    -------
    dict
        A fresh dict with the keys of ``mapping_scheme`` filled from the record.

    Raises
    ------
    KeyError
        If the record has no ``"id"``.
    """
    new_json = copy.deepcopy(mapping_scheme)

    # (target key, Dimensions key) pairs that are copied verbatim when present
    optional_fields = (
        ("doi", "doi"),
        ("year", "year"),
        ("publication_date", "date"),
        ("references", "references"),
        ("relative_citation_ratio", "relative_citation_ratio"),
        ("volume", "volume"),
        ("issue", "issue"),
        ("pages", "pages"),
        ("pmid", "pmid"),
        ("pmcid", "pmcid"),
        ("open_access_categories", "open_access"),
        ("journal_lists", "journal_lists"),
    )

    # direct fields
    new_json["id"] = input_from_api["id"]
    # .get() so that one record lacking a field does not abort a whole batch
    new_json["title"] = input_from_api.get("title")
    new_json["publication_type"] = input_from_api.get("type")
    new_json["times_cited"] = input_from_api.get("times_cited")
    for target_key, source_key in optional_fields:
        if source_key in input_from_api:
            new_json[target_key] = input_from_api[source_key]

    # every concept gets the same constant weight of 1.0
    for concept in input_from_api.get("concepts") or []:
        new_json["concepts"][concept] = 1.0

    journal = input_from_api.get("journal")
    if journal:
        new_json["journal"]["id"] = journal.get("id")
        new_json["journal"]["title"] = journal.get("title")
        # ISSNs live on the record itself, not inside "journal"; the first entry
        # is stored as issn and the second (if any) as eissn. The list may be empty.
        issns = input_from_api.get("issn") or []
        if len(issns) > 0:
            new_json["journal"]["issn"] = issns[0]
        if len(issns) > 1:
            new_json["journal"]["eissn"] = issns[1]

    if "publisher" in input_from_api:
        # only the publisher name is available here; publisher id keeps its default
        new_json["publisher"]["name"] = input_from_api["publisher"]

    # author_affiliations is iterated as a list of lists of researcher dicts
    for author_group in input_from_api.get("author_affiliations") or []:
        for researcher in author_group:
            affiliations = researcher.get("affiliations") or []
            new_json["author_affiliations"].append({
                "first_name": researcher.get("first_name"),
                "last_name": researcher.get("last_name"),
                "researcher_id": researcher.get("researcher_id"),
                "grid_ids": [x["id"] for x in affiliations if "id" in x],
            })

    # The first FOR entry is stored as "first_level" and the second as
    # "second_level", each in its own dict; further entries are ignored.
    # NOTE(review): the example next to "for" in mapping_scheme shows both levels
    # inside a single dict - confirm which layout downstream consumers expect.
    for_codes = input_from_api.get("FOR") or []
    for level, code in zip(("first_level", "second_level"), for_codes):
        new_json["for"].append({level: {"id": code["id"], "name": code["name"]}})

    return new_json

In [47]:
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    starts = range(0, len(lst), n)
    yield from (lst[start:start + n] for start in starts)

In [53]:
# get credentials key
# USE YOURS HERE
# Expects an INI file with a [DIMENSIONS] section holding `username` and `password`.

import configparser
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it did parse, so fail here rather than with an opaque KeyError below.
if not config.read("credentials/conf.ini"):
    raise FileNotFoundError("Could not read credentials/conf.ini")
dimensions_username = config["DIMENSIONS"]["username"]
dimensions_password = config["DIMENSIONS"]["password"]

In [54]:
import requests

#   The credentials to be used
login = {
    'username': dimensions_username,
    'password': dimensions_password
}

#   Send credentials to login url to retrieve token. Raise
#   an error, if the return code indicates a problem.
#   Please use the URL of the system you'd like to access the API
#   in the example below.
#   A timeout is set because requests otherwise waits indefinitely
#   on an unresponsive server.
resp = requests.post('https://app.dimensions.ai/api/auth.json', json=login, timeout=60)
resp.raise_for_status()

#   Create http header using the generated token; it is reused by
#   every subsequent Dimensions query.
headers = {
    'Authorization': "JWT " + resp.json()['token']
}

In [55]:
# Accumulator for the converted Dimensions records (filled by the query cell below).
all_results = []
# Dimensions field name -> identifiers to look up; insertion order is the query order.
payloads = dict(pmcid=pmcids, pmid=pmids, doi=dois)

In [56]:
# Query Dimensions for every identifier type (PMCID, PMID, DOI) in batches and
# collect the records, converted to the CWTS format, in `all_results`.
# Nothing is written to disk here; `out_folder` is used by the storing cell.
out_folder = "datasets_output/json_dimensions_cwts"
limit = 300  # identifiers per query AND the DSL result limit, kept in sync on purpose
query_template_1 = 'search publications where %s in ["'
query_template_2 = '"] return publications[basics+extras+pmcid+publisher+journal_lists+concepts+issn+altmetric_id] limit %d' % limit

for key, payload in payloads.items():
    # Batches never exceed `limit` identifiers, so a query cannot match more
    # records than the DSL `limit` lets it return (assuming one record per identifier).
    batches = list(chunks(payload, limit))
    for current_payload in tqdm(batches, desc=key):
        query = query_template_1 % key + '","'.join(current_payload) + query_template_2
        resp = requests.post(
            'https://app.dimensions.ai/api/dsl.json',
            data=query.encode(),
            headers=headers,
            timeout=120)
        # Fail loudly on HTTP errors (e.g. expired token, rate limiting) instead of
        # hitting a KeyError on "publications" further down.
        resp.raise_for_status()

        r = resp.json()
        all_results.extend([convert_json(result) for result in r["publications"]])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [58]:
print(len(all_results))

44097


In [59]:
# store data: one JSON record per line, 10000 records per chunk file
# NOTE: `out_folder` is reassigned in the Altmetrics section further down, so run
# this cell before that one (or re-run the Dimensions query cell first).
os.makedirs(out_folder, exist_ok=True)  # the folder may not exist on a fresh checkout
for n,chunk in enumerate(chunks(all_results,10000)):
    with open(os.path.join(out_folder,"chunk_%d"%n)+".json","w",encoding="utf-8") as f:
        for r in chunk:
            json.dump(r, f)
            f.write("\n")

In [64]:
# Collect the identifiers Dimensions returned, to widen the Altmetrics lookup.
# Records without a DOI contribute None to `dois_dimensions` (clean_doi passes
# non-strings through); their PMID, which may itself be None, goes to `pmids_dimensions`.
dois_dimensions = list(map(clean_doi, (record["doi"] for record in all_results)))
pmids_dimensions = [record["pmid"] for record in all_results if not record["doi"]]

In [65]:
print(len(dois_dimensions))
print(len(pmids_dimensions))

44097
976


In [66]:
# De-duplicated identifiers from the original dataset plus those found via Dimensions.
all_dois = list(set(dois) | set(dois_dimensions))
all_pmids = list(set(pmids) | set(pmids_dimensions))

In [67]:
print(len(all_dois))
print(len(all_pmids))
print(len(dois))
print(len(pmids))
print(len(pmcids))

55005
1181
42538
336
2772


### Altmetrics

Note that the Altmetrics API cannot be queried by PMCID, so we query it with all available DOIs and PMIDs: those in our original dataset plus those returned by Dimensions.

In [72]:
# get API key
# USE YOURS HERE
# Expects an INI file with an [ALTMETRICS] section holding `key`.

import configparser
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it did parse, so fail here rather than with an opaque KeyError below.
if not config.read("credentials/conf.ini"):
    raise FileNotFoundError("Could not read credentials/conf.ini")
api_key = config["ALTMETRICS"]["key"]
# query-string parameters sent with every Altmetrics request
payload = {'key': api_key}

In [89]:
import requests, time

# Altmetrics endpoints; the identifier is appended to the base URL.
# HTTPS is used because the API key travels as a query parameter on every request.
doi_base_url = "https://api.altmetric.com/v1/fetch/doi/"
pmid_base_url = "https://api.altmetric.com/v1/fetch/pmid/"

# NOTE: this rebinds `out_folder`, which the Dimensions section also uses.
out_folder = "datasets_output/json_altmetrics_cwts"

In [94]:
# Query by DOI: one request per DOI, one JSON file per successful response.
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# this scaffolding is needed to avoid the request per second limitation of Altmetrics, which is variably enforced 
session = requests.Session()
retry = Retry(connect=5, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

os.makedirs(out_folder, exist_ok=True)  # the folder may not exist on a fresh checkout
max_attempts = 5  # per DOI, so a persistent 429 cannot keep the loop running forever

# enumerate(tqdm(...)) rather than tqdm(enumerate(...)) so the bar knows the total
for n,doi in enumerate(tqdm(all_dois)):
    if not doi:
        # all_dois can contain None (Dimensions records without a DOI)
        continue
    if n>0 and (n)%500 == 0:
        time.sleep(60) # avoid being banned
    # Retry the same DOI in place on 429 instead of appending it to the list
    # being iterated, which would grow all_dois and leave duplicates behind.
    # NOTE(review): the DOI is concatenated into the URL unescaped; DOIs containing
    # '?' or '#' would need URL-quoting - confirm whether any occur in the data.
    for attempt in range(max_attempts):
        r = session.get(doi_base_url+doi, params=payload, timeout=60)
        if r.status_code != 429: # 429 means API limitations, we need to backoff
            break
        time.sleep(60)
    if not r.status_code == 200:
        # any other non-200 answer: skip this DOI
        continue

    # file name derived from the DOI: "." -> "_" and "/" -> ":"
    f_name = doi.replace(".","_")
    f_name = f_name.replace("/",":")
    with open(os.path.join(out_folder,f_name)+".json","w",encoding="utf-8") as f:
        json.dump(r.json(), f, indent=4)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [97]:
# Query by PMID: one request per PMID, one JSON file per successful response.

# this scaffolding is needed to avoid the request per second limitation of Altmetrics, which is variably enforced 
session = requests.Session()
retry = Retry(connect=5, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

os.makedirs(out_folder, exist_ok=True)  # the folder may not exist on a fresh checkout
max_attempts = 5  # per PMID, so a persistent 429 cannot keep the loop running forever

for pmid in tqdm(all_pmids):
    if not pmid:
        # all_pmids can contain None (Dimensions records with neither DOI nor PMID)
        continue
    pmid_str = str(int(pmid))
    # Retry the PMID itself on 429. The previous version appended the leftover `doi`
    # variable to all_dois here, so a rate-limited PMID was silently dropped.
    for attempt in range(max_attempts):
        r = session.get(pmid_base_url+pmid_str, params=payload, timeout=60)
        if r.status_code != 429: # 429 means API limitations, we need to backoff
            break
        time.sleep(60)
    if not r.status_code == 200:
        # any other non-200 answer: skip this PMID
        continue
    f_name = pmid_str
    with open(os.path.join(out_folder,f_name)+".json","w",encoding="utf-8") as f:
        json.dump(r.json(), f, indent=4)

HBox(children=(FloatProgress(value=0.0, max=1181.0), HTML(value='')))


