In [7]:
import requests
import time
import os
import urllib.parse
import pandas as pd
import random
from bs4 import BeautifulSoup   #for parsing xml and html
from random import randint  
from dotenv import load_dotenv 
load_dotenv()

True

In [8]:
BASEURL_ST = 'https://api.clarivate.com/apis/wos-starter/v1/'
HEADERS_ST = {'X-APIKey': os.getenv("APIKEY")}

In [9]:
SEARCH_QUERY = "OG=(Dartmouth College)"
initial_request = requests.get(f'{BASEURL_ST}documents?db=WOS&q={urllib.parse.quote(SEARCH_QUERY)}', headers=HEADERS_ST)
data = initial_request.json()
total_records = data['metadata']['total']
print(total_records)
data

77864


{'metadata': {'total': 77864, 'page': 1, 'limit': 10},
 'hits': [{'uid': 'WOS:000202856600007',
   'title': 'Contributions from the cryptogamic laboratory of Harvard University. XLIV. New or little known unicellular algae. I. Chlorocystis cohnii.',
   'types': ['Article'],
   'sourceTypes': ['Article'],
   'source': {'sourceTitle': 'BOTANICAL GAZETTE',
    'publishYear': 1900,
    'publishMonth': 'JUL-DEC',
    'volume': '30',
    'issue': '1-6',
    'pages': {'range': '100-U27', 'begin': '100', 'end': 'U27', 'count': 14}},
   'names': {'authors': [{'displayName': 'Moore, George Thomas',
      'wosStandard': 'Moore, GT',
      'researcherId': 'DGM-8444-2022'}]},
   'links': {'record': 'https://www.webofscience.com/api/gateway?GWVersion=2&SrcApp=dartrds_jeremy_01&SrcAuth=WosAPI&KeyUT=WOS:000202856600007&DestLinkType=FullRecord&DestApp=WOS_CPL',
    'citingArticles': 'https://www.webofscience.com/api/gateway?GWVersion=2&SrcApp=dartrds_jeremy_01&SrcAuth=WosAPI&KeyUT=WOS:000202856600007&De

In [10]:
def retrieve_all_data(datahits):
    #hits = [hit for hit in data['hits']]
    datalist = []
    for hit in datahits:
        datadict = {}
        datadict['uid'] = hit.get("uid", "")
        datadict['title'] = hit.get("title", "")
        datadict['authors'] = "; ".join([name.get('wosStandard') for name in hit.get("names").get("authors")])
        datadict['researcherIds'] = "; ".join([str(name.get('researcherId')) for name in hit.get("names").get("authors")])
        datadict['pubyear'] = hit.get("source").get("publishYear")
        datadict['source_title'] = hit.get("source").get("sourceTitle")
        datadict['volume'] = hit.get("source").get("volume")
        datadict['page_start'] = hit.get("source").get("pages").get("begin")
        datadict['page_end'] = hit.get("source").get("pages").get("end")
        datadict['page_count'] = hit.get("source").get("pages").get("count")
        identifiers = hit.get("identifiers")
        datadict['doi'] = identifiers.get("doi")
        datadict['issn'] = identifiers.get("issn")
        datadict['eissn'] = identifiers.get("eissn")
        datadict['isbn'] = identifiers.get("isbn")
        citations = hit.get("citations")
        if len(citations) > 0:   #for some reason the citations key stores a dict inside a list
            datadict["citation_counts"] = citations[0].get("count")
         
        datadict['author_keywords'] = "; ".join([kw.lower() for kw in hit.get("keywords").get("authorKeywords")])
        #datadict['keywords_plus'] = hit.get("keywords").get("keywordsPlus")
        links = hit.get("links")
        datadict['record_links'] = links.get("record")
        datadict['citing_links'] = links.get("citingArticles")
        datadict['reference_links'] = links.get("references")
        datadict['related_links'] = links.get("related")
        datalist.append(datadict)
    #print(datalist)
    return(pd.DataFrame(datalist))

In [15]:
print(f"Our current search query: {SEARCH_QUERY} returned {total_records} records.")
start_num = 0
end_num = 10000
if start_num > 0:
    start_page = (start_num - 1)// 50
else:
    start_page = 0
end_page = (end_num - 1) // 50
print("retrieving pages: ", start_page, end_page)

requests_required = ((total_records - 1) // 50) + 1  #306 records - 1 = 305 // 50 = 6 + 1 = 7
print(requests_required)
datadict = {}
if requests_required > 1:
    print(f"API requests required to get all data from the query - '{SEARCH_QUERY}': {requests_required}")
#for i in range(requests_required):
if end_page > requests_required:
    end_page = requests_required
for i in range(start_page, end_page):
    subsequent_response = requests.get(
        f'{BASEURL_ST}documents?db=WOS&q={SEARCH_QUERY}&limit=50&page={i+1}', headers=HEADERS_ST)
    data = subsequent_response.json()
    if i == 0:
        print(data['metadata'])
        datadict = data
    else:
        datadict['hits'].extend(data['hits'])
    print(f"**Pulling from Page {i+1} of {requests_required}**")
print(f"Total number of records pulled: {len(datadict['hits'])}")
uids = set([hit['uid'] for hit in datadict['hits']])
print(f"Total number of unique ids: {len(uids)}")
print(f"Number of requests remaining today: {subsequent_response.headers['X-RateLimit-Remaining-Day']}.")   

Our current search query: OG=(Dartmouth College) returned 77864 records.
retrieving pages:  0 199
1558
API requests required to get all data from the query - 'OG=(Dartmouth College)': 1558
{'total': 77864, 'page': 1, 'limit': 50}
**Pulling from Page 1 of 1558**
**Pulling from Page 2 of 1558**
**Pulling from Page 3 of 1558**
**Pulling from Page 4 of 1558**
**Pulling from Page 5 of 1558**
**Pulling from Page 6 of 1558**
**Pulling from Page 7 of 1558**
**Pulling from Page 8 of 1558**
**Pulling from Page 9 of 1558**
**Pulling from Page 10 of 1558**
**Pulling from Page 11 of 1558**
**Pulling from Page 12 of 1558**
**Pulling from Page 13 of 1558**
**Pulling from Page 14 of 1558**
**Pulling from Page 15 of 1558**
**Pulling from Page 16 of 1558**
**Pulling from Page 17 of 1558**
**Pulling from Page 18 of 1558**
**Pulling from Page 19 of 1558**
**Pulling from Page 20 of 1558**
**Pulling from Page 21 of 1558**
**Pulling from Page 22 of 1558**
**Pulling from Page 23 of 1558**
**Pulling from Page 

In [None]:
df = retrieve_all_data(datadict['hits'])
df.head()

NameError: name 'datadict' is not defined

In [None]:
df.to_csv(f"wos_results_{SEARCH_QUERY}_{start_num}-{end_num}.csv", encoding = 'utf=8')