# Getting Dimensions and Altmetrics data

In this notebook, starting from a list of COVID19 publications with a DOI or PMID, we query the Dimensions and Altmetrics APIs. We then export the results in JSON, according to the format and structure we need at CWTS. Yours might vary, but adapting these scripts should be straightforward.

Dimensions API reference: https://docs.dimensions.ai/dsl/index.html
Altmetrics API reference: http://api.altmetric.com

*Please note you will need your own access keys from each of the two APIs to use this code.*

In [15]:
# magics, warnings and imports
%load_ext autoreload
%autoreload 2
import warnings; warnings.simplefilter('ignore')

import os, random, codecs, json, time
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np

seed = 99
random.seed(seed)
np.random.seed(seed)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# load the pub dataframe (see Notebook_1 for this)

df_pub = pd.read_csv("datasets_output/df_pub.csv", compression="gzip")

In [17]:
df_pub.shape

(48428, 14)

In [18]:
df_pub.head()

Unnamed: 0.1,Unnamed: 0,pub_id,title,abstract,publication_year,publication_month,journal,volume,issue,pages,doi,pmid,pmcid,timestamp
0,0,0,The Possible Immunological Pathways for the Va...,,2020.0,3.0,Electronic Journal of General Medicine,17.0,4.0,,10.29333/ejgm/7850,,,2020-03-28 08:46:55.291546
1,1,1,A Method of Estimating Time-to-Recovery for a ...,,2020.0,3.0,Research Square,,,,10.21203/rs.3.rs-18190/v1,,,2020-03-28 08:46:55.291546
2,2,2,Preparation for the quarantine of the cruise s...,,2020.0,3.0,JMIR Preprints,,,,10.2196/preprints.18821,,,2020-03-28 08:46:55.291546
3,3,3,Differences and similarities between Severe Ac...,,2020.0,3.0,European review for medical and pharmacologica...,24.0,5.0,2781-2783,10.26355/eurrev_202003_20551,32196628.0,,2020-03-28 08:46:55.291546
4,4,4,From SARS-CoV to SARS-CoV-2: The response and ...,Abstract:,2020.0,2.0,Fa yi xue za zhi,36.0,1.0,1-3,10.12116/j.issn.1004-5619.2020.01.001,32198983.0,,2020-03-28 08:46:55.291546


In [19]:
# let's get identifiers out

dois = df_pub[pd.notna(df_pub.doi)].doi.values
pmids = df_pub[(pd.isna(df_pub.doi)) & (pd.notna(df_pub.pmid))].pmid.values
pmids = [str(int(i)) for i in pmids]
pmcids = df_pub[(pd.isna(df_pub.doi)) & (pd.isna(df_pub.pmid)) & (pd.notna(df_pub.pmcid))].pmcid.values

In [20]:
print(len(dois))
print(len(pmids))
print(len(pmcids))

45393
2872
163


In [21]:
# some DOIs will need cleaning
def clean_doi(d):
    if isinstance(d,str):
        d = d.replace("https://doi.org/","")
        d = d.replace("doi:","")
        return d
    return d

In [22]:
dois = [clean_doi(d) for d in dois]

### Dimensions

In [23]:
# Schema to convert to CWTS-compatible JSON. Skip this if you prefer to have the Dimensions' schema.

mapping_scheme = {"id": None,
"format": None,
"status": None,
"publication_type": None,
"doi": None,
"pmid": None,
"pmcid": None,
"title": None,
"year" : None,
"publication_date" : None,
"volume" : None,
"issue" : None,
"pages" : None,
"open_access_versions": [],
"concepts": {},
"journal": {"id": None, "title": None, "issn": None, "eissn": None}, 
"publisher": {"id": None, "name": None},
"open_access_categories": [],
"journal_lists": [],
"author_affiliations": [], #"author_affiliations": [{"first_name": "Sunir", "last_name": "Gohil", "researcher_id": "ur.01154753576.20", "grid_ids": []}, {"first_name": "Sabine", "last_name": "Vuik", "researcher_id": "ur.015721262671.44", "grid_ids": []}, {"first_name": "Ara", "last_name": "Darzi", "researcher_id": "ur.01255016073.58", "grid_ids": []}]
"funding": [], #SKIP for now
"for": [], # [{"first_level": {"id": "11", "name": "Medical and Health Sciences"}, "second_level": {"id": "1117", "name": "Public Health and Health Services"}}]
"language": None,
"references": [],
"clinical_trials": [], #SKIP for now
"created_in_dimensions" : None,
"version_of_record" : None,
"times_cited" : None, #NEW
"relative_citation_ratio" : None #NEW
}

In [39]:
# https://app.dimensions.ai/browse/categories/publication/for
for_highest_level = {"01":"Mathematical Sciences",
"02":"Physical Sciences",
"03": "Chemical Sciences",
"04": "Earth Sciences",
"05": "Environmental Sciences",
"06": "Biological Sciences",
"07": "Agricultural and Veterinary Sciences",
"08": "Information and Computing Sciences",
"09": "Engineering",
"10": "Technology",
"11": "Medical and Health Sciences",
"12": "Built Environment and Design",
"13": "Education",
"14": "Economics",
"15": "Commerce, Management, Tourism and Services",
"16": "Studies in Human Society",
"17": "Psychology and Cognitive Sciences",
"18": "Law and Legal Studies",
"19": "Studies in Creative Arts and Writing",
"20": "Language, Communication and Culture",
"21": "History and Archaeology",
"22": "Philosophy and Religious Studies"}

In [40]:
# This is ugly and could be improved
import copy

def convert_json(input_from_api, mapping_scheme=mapping_scheme):
    new_json = copy.deepcopy(mapping_scheme)
    # direct fields
    new_json["title"] = input_from_api["title"]
    new_json["id"] = input_from_api["id"]
    if "doi" in input_from_api.keys():
        new_json["doi"] = input_from_api["doi"]
    new_json["publication_type"] = input_from_api["type"]
    if "year" in input_from_api.keys():
        new_json["year"] = input_from_api["year"]
    if "date" in input_from_api.keys():
        new_json["publication_date"] = input_from_api["date"]
    new_json["times_cited"] = input_from_api["times_cited"]
    if "references" in input_from_api.keys():
        new_json["references"] = input_from_api["references"]
    if "relative_citation_ratio" in input_from_api.keys():
        new_json["relative_citation_ratio"] = input_from_api["relative_citation_ratio"]
    if "volume" in input_from_api.keys():
        new_json["volume"] = input_from_api["volume"]
    if "issue" in input_from_api.keys():
        new_json["issue"] = input_from_api["issue"]
    if "pages" in input_from_api.keys():
        new_json["pages"] = input_from_api["pages"]
    if "pmid" in input_from_api.keys():
        new_json["pmid"] = input_from_api["pmid"]
    if "pmcid" in input_from_api.keys():
        new_json["pmcid"] = input_from_api["pmcid"]
    if "concepts" in input_from_api.keys():
        for c in input_from_api["concepts"]:
            new_json["concepts"].update({c:1.0})
    if "journal" in input_from_api.keys():
        new_json["journal"]["id"] = input_from_api["journal"]["id"]
        new_json["journal"]["title"] = input_from_api["journal"]["title"]
        if "issn" in input_from_api.keys():
            new_json["journal"]["issn"] = input_from_api["issn"][0]
            if len(input_from_api["issn"])>1:
                new_json["journal"]["eissn"] = input_from_api["issn"][1]
    if "publisher" in input_from_api.keys():
        new_json["publisher"]["name"] = input_from_api["publisher"]
    if "open_access" in input_from_api.keys():
        new_json["open_access_categories"] = input_from_api["open_access"]
    if "journal_lists" in input_from_api.keys():
        new_json["journal_lists"] = input_from_api["journal_lists"]
    if "author_affiliations" in input_from_api.keys():
        for affiliation in input_from_api["author_affiliations"]:
            for researcher in affiliation:
                new_researcher = {"first_name": researcher["first_name"], "last_name": researcher["last_name"], "researcher_id": researcher["researcher_id"], "grid_ids": []}
                if "affiliations" in researcher.keys():
                    new_researcher["grid_ids"] = [x["id"] for x in researcher["affiliations"] if "id" in x.keys()]
                new_json["author_affiliations"].append(new_researcher)
    if "FOR" in input_from_api.keys():
        for item in input_from_api["FOR"]:
            item_id = item["name"][:4]
            upper_item_id = item_id[:2]
            item_name = item["name"][5:]
            new_json["for"].append({"first_level":{"id": upper_item_id,"name": for_highest_level[upper_item_id]},"second_level":{"id": item_id,"name": item_name}})
    return new_json

In [41]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [42]:
# get credentials key
# USE YOURS HERE

import configparser
config = configparser.ConfigParser()
config.read("credentials/conf.ini")
dimensions_username = config["DIMENSIONS"]["username"]
dimensions_password = config["DIMENSIONS"]["password"]

In [43]:
import requests

#   The credentials to be used
login = {
    'username': dimensions_username,
    'password': dimensions_password
}

#   Send credentials to login url to retrieve token. Raise
#   an error, if the return code indicates a problem.
#   Please use the URL of the system you'd like to access the API
#   in the example below.
resp = requests.post('https://app.dimensions.ai/api/auth.json', json=login)
resp.raise_for_status()

#   Create http header using the generated token.
headers = {
    'Authorization': "JWT " + resp.json()['token']
}

In [44]:
all_results = list()
payloads = {"pmcid":pmcids,"pmid":pmids,"doi":dois}

In [45]:
# get and save all results for DOIs - CWTS format
out_folder = "datasets_output/json_dimensions_cwts"
query_template_1 = 'search publications where %s in ["'
query_template_2 = '"] return publications[basics+extras+pmcid+publisher+journal_lists+concepts+issn+altmetric_id] limit 300'
limit = 300
current_payload = list()

for key,payload in payloads.items():
    for n,i in tqdm(enumerate(payload)):
        current_payload.append(i)
        if (n > 0 and n % limit == 0) or n >= (len(payload)-1): # query Dimensions, limit reached
            #print((query_template_1+'","'.join(current_payload)+query_template_2))
            resp = requests.post(
                'https://app.dimensions.ai/api/dsl.json',
                data=(query_template_1%key+'","'.join(current_payload)+query_template_2).encode(),
                headers=headers)
            current_payload = list()
            #print(resp.json())

            #   Display raw result
            r = resp.json()
            #print(r["_stats"]["total_count"])
            #print(len(r["publications"]))

            all_results.extend([convert_json(result) for result in r["publications"]])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [46]:
print(len(all_results))

47353


In [47]:
# store data
for n,chunk in enumerate(chunks(all_results,10000)):
    with codecs.open(os.path.join(out_folder,"chunk_%d"%n)+".json","w") as f:
        for r in chunk:
            json.dump(r, f)
            f.write("\n")

In [18]:
# get DOIs from Dimensions out
dois_dimensions = [clean_doi(d["doi"]) for d in all_results]
pmids_dimensions = [d["pmid"] for d in all_results if not d["doi"]]

In [19]:
print(len(dois_dimensions))
print(len(pmids_dimensions))

47353
971


In [20]:
all_dois = list(set(dois).union(set(dois_dimensions)))
all_pmids = list(set(pmids).union(set(pmids_dimensions)))

In [37]:
print(len(all_dois))
print(len(all_pmids))
print(len(dois))
print(len(pmids))
print(len(pmcids))

47115
2873
45393
2872
163


### Altmetrics

Note the Altmetrics API cannot be queried by PMCID, so we try to use all available DOIs and PMIDs from Dimensions.

In [38]:
# get API key
# USE YOURS HERE

import configparser
config = configparser.ConfigParser()
config.read("credentials/conf.ini")
api_key = config["ALTMETRICS"]["key"]
#api_key = config["ALTMETRICS"]["key2"]
payload = {'key': api_key}

In [39]:
import requests, time

doi_base_url = "http://api.altmetric.com/v1/fetch/doi/"
pmid_base_url = "http://api.altmetric.com/v1/fetch/pmid/"

out_folder = "datasets_output/json_altmetrics_cwts"
all_tweet_ids = list()

In [40]:
# Query by DOI
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# this scaffolding is needed to avoid the request per second limitation of Altmetrics, which is variably enforced 
session = requests.Session()
retry = Retry(connect=5, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

for n,doi in tqdm(enumerate(all_dois)):
    if not doi:
        continue
    #if n>0 and (n)%500 == 0:
        #print("Sleeping")
        #time.sleep(60) # avoid being banned
    r = session.get(doi_base_url+doi, params=payload)
    if not r.status_code == 200:
        #print(r.headers)
        if r.status_code == 429: # means API limitations, we need to backoff
            all_dois.append(doi)
            time.sleep(60)
        continue
        
    f_name = doi.replace(".","_")
    f_name = f_name.replace("/",":")
    with codecs.open(os.path.join(out_folder,f_name)+".json","w") as f:
        json.dump(r.json(), f)
    if "posts" in r.json().keys():
        if isinstance(r.json()["posts"],dict) and "twitter" in r.json()["posts"].keys():
            for tweet in r.json()["posts"]["twitter"]:
                all_tweet_ids.append((doi,"",tweet["tweet_id"],tweet["author"]["tweeter_id"]))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [41]:
# Query by PMID

# this scaffolding is needed to avoid the request per second limitation of Altmetrics, which is variably enforced 
session = requests.Session()
retry = Retry(connect=5, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

for pmid in tqdm(all_pmids):
    if not pmid:
        continue
    r = session.get(pmid_base_url+str(int(pmid)), params=payload)
    if not r.status_code == 200:
        if r.status_code == 429: # means API limitations, we need to backoff
            all_dois.append(doi)
            time.sleep(60)
        continue
    f_name = str(int(pmid))
    with codecs.open(os.path.join(out_folder,f_name)+".json","w") as f:
        json.dump(r.json(), f)
    if "posts" in r.json().keys():
        if isinstance(r.json()["posts"],dict) and "twitter" in r.json()["posts"].keys():
            for tweet in r.json()["posts"]["twitter"]:
                all_tweet_ids.append(("",str(int(pmid)),tweet["tweet_id"],tweet["author"]["tweeter_id"]))

HBox(children=(FloatProgress(value=0.0, max=2873.0), HTML(value='')))




In [18]:
all_tweet_ids = list(set(all_tweet_ids))

In [19]:
len(all_tweet_ids)

1543154

In [20]:
with codecs.open("datasets_output/all_tweet_ids.csv", "w") as f:
    f.write("doi,pmid,tweet_id,user_id\n")
    for tweet in list(set(all_tweet_ids)):
        f.write(",".join(tweet)+"\n")

In [16]:
# separately export twitter IDs if necessary (to hydrate them)

out_folder = "datasets_output/json_altmetrics_cwts"
all_tweet_ids = list()

for root, dirs, files in os.walk(out_folder):
    for file in files:
        if ".json" in file:
            data = json.loads(codecs.open(os.path.join(root,file)).read())
            if "posts" in data.keys():
                if isinstance(data["posts"],dict) and "twitter" in data["posts"].keys():
                    for tweet in data["posts"]["twitter"]:
                        doi = ""
                        if "doi" in data["citation"]:
                            doi = str(data["citation"]["doi"])
                        pmid = ""
                        if "pmid" in data["citation"]:
                            pmid = str(data["citation"]["pmid"])
                        user_id = ""
                        if "tweeter_id" in tweet["author"].keys():
                            user_id = str(tweet["author"]["tweeter_id"])
                        all_tweet_ids.append((doi,pmid,str(tweet["tweet_id"]),user_id))