In [1]:
import requests
import time
import os
import urllib.parse
import pandas as pd
import random
from bs4 import BeautifulSoup   #for parsing xml and html
from random import randint  
from dotenv import load_dotenv 
load_dotenv()

True

In [2]:
BASEURL_ST = 'https://api.clarivate.com/apis/wos-starter/v1/'
HEADERS_ST = {'X-APIKey': os.getenv("APIKEY")}

In [3]:
SEARCH_QUERY = "OG=(Dartmouth College)"
initial_request = requests.get(f'{BASEURL_ST}documents?db=WOS&q={urllib.parse.quote(SEARCH_QUERY)}', headers=HEADERS_ST)
data = initial_request.json()
total_records = data['metadata']['total']
print(total_records)
data

77866


{'metadata': {'total': 77866, 'page': 1, 'limit': 10},
 'hits': [{'uid': 'WOS:000202856600007',
   'title': 'Contributions from the cryptogamic laboratory of Harvard University. XLIV. New or little known unicellular algae. I. Chlorocystis cohnii.',
   'types': ['Article'],
   'sourceTypes': ['Article'],
   'source': {'sourceTitle': 'BOTANICAL GAZETTE',
    'publishYear': 1900,
    'publishMonth': 'JUL-DEC',
    'volume': '30',
    'issue': '1-6',
    'pages': {'range': '100-U27', 'begin': '100', 'end': 'U27', 'count': 14}},
   'names': {'authors': [{'displayName': 'Moore, George Thomas',
      'wosStandard': 'Moore, GT',
      'researcherId': 'DGM-8444-2022'}]},
   'links': {'record': 'https://www.webofscience.com/api/gateway?GWVersion=2&SrcApp=dartrds_jeremy_01&SrcAuth=WosAPI&KeyUT=WOS:000202856600007&DestLinkType=FullRecord&DestApp=WOS_CPL',
    'citingArticles': 'https://www.webofscience.com/api/gateway?GWVersion=2&SrcApp=dartrds_jeremy_01&SrcAuth=WosAPI&KeyUT=WOS:000202856600007&De

In [12]:
def retrieve_all_data(datahits):
    #hits = [hit for hit in data['hits']]
    datalist = []
    for hit in datahits:
        datadict = {}
        datadict['uid'] = hit.get("uid", "")
        datadict['title'] = hit.get("title", "")
        try:
            datadict['authors'] = "; ".join([name.get('wosStandard') for name in hit.get("names").get("authors")])
        except TypeError:
            datadict['authors'] = ""
        try:
            datadict['researcherIds'] = "; ".join([str(name.get('researcherId')) for name in hit.get("names").get("authors")])
        except TypeError:
            datadict['researcherIds'] = ""
        datadict['pubyear'] = hit.get("source").get("publishYear")
        datadict['source_title'] = hit.get("source").get("sourceTitle")
        datadict['volume'] = hit.get("source").get("volume")
        datadict['page_start'] = hit.get("source").get("pages").get("begin")
        datadict['page_end'] = hit.get("source").get("pages").get("end")
        datadict['page_count'] = hit.get("source").get("pages").get("count")
        identifiers = hit.get("identifiers")
        datadict['doi'] = identifiers.get("doi")
        datadict['issn'] = identifiers.get("issn")
        datadict['eissn'] = identifiers.get("eissn")
        datadict['isbn'] = identifiers.get("isbn")
        citations = hit.get("citations")
        if len(citations) > 0:   #for some reason the citations key stores a dict inside a list
            datadict["citation_counts"] = citations[0].get("count")
         
        datadict['author_keywords'] = "; ".join([kw.lower() for kw in hit.get("keywords").get("authorKeywords")])
        #datadict['keywords_plus'] = hit.get("keywords").get("keywordsPlus")
        links = hit.get("links")
        datadict['record_links'] = links.get("record")
        datadict['citing_links'] = links.get("citingArticles")
        datadict['reference_links'] = links.get("references")
        datadict['related_links'] = links.get("related")
        datalist.append(datadict)
    #print(datalist)
    return(pd.DataFrame(datalist))

Problem: once it reaches page 1001 the API returns the following error:

`{'error': {'status': 400, 'title': 'Invalid syntax for the request', 'details': "The 'db' parameter is not valid for the response."}}`

Is this the result of reaching the data limit for the day / week? Or that the API doesn't allow downloading beyond 50000 records??

In [30]:
#2min 46 sec to retrieve 10k records
print(f"Our current search query: {SEARCH_QUERY} returned {total_records} records.")
start_num = 50000
end_num = 50100
if start_num > 0:
    start_page = (start_num - 1)// 50
else:
    start_page = 0
end_page = (end_num - 1) // 50
print("retrieving pages: ", start_page, end_page)

requests_required = ((total_records - 1) // 50) + 1  #306 records - 1 = 305 // 50 = 6 + 1 = 7
print(requests_required)
datadict = {}
if requests_required > 1:
    print(f"API requests required to get all data from the query - '{SEARCH_QUERY}': {requests_required}")
#for i in range(requests_required):
if end_page > requests_required:
    end_page = requests_required
for i in range(start_page, end_page):
    print(f'{BASEURL_ST}documents?db=WOS&q={SEARCH_QUERY}&limit=50&page={i+1}')
    subsequent_response = requests.get(
        f'{BASEURL_ST}documents?db=WOS&q={SEARCH_QUERY}&limit=50&page={i+1}', headers=HEADERS_ST)
    data = subsequent_response.json()
    if i == 0 or i == start_page:
        print(data)
        print(data['metadata'])
        datadict = data
    else:
        print(data)
        datadict['hits'].extend(data['hits'])
    print(f"**Pulling from Page {i+1} of {requests_required}**")
print(f"Total number of records pulled: {len(datadict['hits'])}")
uids = set([hit['uid'] for hit in datadict['hits']])
print(f"Total number of unique ids: {len(uids)}")
print(f"Number of requests remaining today: {subsequent_response.headers['X-RateLimit-Remaining-Day']}.")   

Our current search query: OG=(Dartmouth College) returned 77866 records.
retrieving pages:  999 1001
1558
API requests required to get all data from the query - 'OG=(Dartmouth College)': 1558
https://api.clarivate.com/apis/wos-starter/v1/documents?db=WOS&q=OG=(Dartmouth College)&limit=50&page=1000
{'metadata': {'total': 77866, 'page': 1000, 'limit': 50}, 'hits': [{'uid': 'WOS:000357045700019', 'title': 'IRREDUCIBLE INDUCED REPRESENTATIONS OF FELL BUNDLE <i>C</i>*-ALGEBRAS', 'types': ['Article'], 'sourceTypes': ['Article'], 'source': {'sourceTitle': 'TRANSACTIONS OF THE AMERICAN MATHEMATICAL SOCIETY', 'publishYear': 2015, 'publishMonth': 'JUL', 'volume': '367', 'issue': '7', 'articleNumber': 'PII S0002-9947(2014)06316-2', 'pages': {'range': '5059-5079', 'begin': '5059', 'end': '5079', 'count': 21}}, 'names': {'authors': [{'displayName': 'Ionescu, Marius', 'wosStandard': 'Ionescu, M', 'researcherId': 'FCT-1362-2022'}, {'displayName': 'Williams, Dana P.', 'wosStandard': 'Williams, DP', 'r

KeyError: 'hits'

In [16]:
df = retrieve_all_data(datadict['hits'])
df.head()

Unnamed: 0,uid,title,authors,researcherIds,pubyear,source_title,volume,page_start,page_end,page_count,doi,issn,eissn,isbn,citation_counts,author_keywords,record_links,citing_links,reference_links,related_links
0,WOS:000284964100612,Alveolar macrophage apoptosis following pneumo...,"Preston, JA; Houghton, AM; Craig, RW; Greaves,...",FRH-1567-2022; FAV-8206-2022; ETB-1517-2022; D...,2010,IMMUNOLOGY,131,181,181,1,,0019-2805,,,0,,https://www.webofscience.com/api/gateway?GWVer...,,,
1,WOS:000283973700282,Evaluation of the BioGenex Xmatrx™ Automated S...,"Schwab, MC; Memoli, VA; Black, CC; Bentley, HA...",DRA-0072-2022; FMC-9581-2022; CHM-5582-2022; E...,2010,JOURNAL OF MOLECULAR DIAGNOSTICS,12,917,917,1,,1525-1578,,,0,,https://www.webofscience.com/api/gateway?GWVer...,,,
2,WOS:000283973700279,Establishing a CYP2C19 Genotyping Assay for Cl...,"Cervinski, MA; Schwab, MC; Lefferts, JA; Lewis...",ABC-6456-2020; DRA-0072-2022; I-4745-2019; DWU...,2010,JOURNAL OF MOLECULAR DIAGNOSTICS,12,916,917,2,,1525-1578,,,0,,https://www.webofscience.com/api/gateway?GWVer...,,,
3,WOS:000283973700278,Optimization of the Asuragen Human FMR 1 PCR A...,"Bentley, HA; Lebel, KA; Tyropolis, AM; Pfluege...",EOJ-3571-2022; FKT-6048-2022; EAA-6124-2022; F...,2010,JOURNAL OF MOLECULAR DIAGNOSTICS,12,916,916,1,,1525-1578,,,0,,https://www.webofscience.com/api/gateway?GWVer...,,,
4,WOS:000283973700280,Evaluation of the Luminex® xTAG™ CYP2D6 v2 Assay,"Lefferts, CL; Lee, HK; Lewis, LD; Hicks, ND; L...",DCS-4537-2022; DDA-0805-2022; DWU-0854-2022; G...,2010,JOURNAL OF MOLECULAR DIAGNOSTICS,12,917,917,1,,1525-1578,,,0,,https://www.webofscience.com/api/gateway?GWVer...,,,


In [17]:
df.to_csv(f"../hidden-data/wos_results_{SEARCH_QUERY}_{start_num}-{end_num}.csv", encoding = 'utf=8')

[{'uid': 'WOS:000241030900077',
  'title': 'A practical method for the synthesis of indolylaryl- and bisindolylmaleimides',
  'types': ['Article'],
  'sourceTypes': ['Article'],
  'source': {'sourceTitle': 'ORGANIC LETTERS',
   'publishYear': 2006,
   'publishMonth': 'OCT 12',
   'volume': '8',
   'issue': '21',
   'pages': {'range': '4975-4977',
    'begin': '4975',
    'end': '4977',
    'count': 3}},
  'names': {'authors': [{'displayName': 'Roy, Sudipta',
     'wosStandard': 'Roy, S',
     'researcherId': 'FZC-0592-2022'},
    {'displayName': 'Roy, Sujata',
     'wosStandard': 'Roy, S',
     'researcherId': 'AAY-4660-2020'},
    {'displayName': 'Gribble, Gordon W.',
     'wosStandard': 'Gribble, GW',
     'researcherId': 'EYV-1466-2022'}]},
  'links': {'record': 'https://www.webofscience.com/api/gateway?GWVersion=2&SrcApp=dartrds_jeremy_01&SrcAuth=WosAPI&KeyUT=WOS:000241030900077&DestLinkType=FullRecord&DestApp=WOS_CPL',
   'citingArticles': 'https://www.webofscience.com/api/gateway