# API data extraction

Extract journal articles relevant to ocean basins.

Uses the Web of Science Expanded API. An API key is required for API searches.

See https://developer.clarivate.com/apis for details.

Limits apply to API searches (e.g., a limit of 100,000 search items).

Note that the code below is only an example script and must be adapted for specific queries.

Returns a CSV file of data.

## Preamble

In [None]:
import requests
import time
from tqdm import tqdm
import sys
import json
from datetime import date
import numpy as np
import pandas as pd
from collections import defaultdict


In [None]:
# Publication years to extract: 2000 through 2020 inclusive, as plain ints.
export_years = list(range(2000, 2021))

In [None]:
# Today's date as YYYY-MM-DD; used later to stamp the name of the extract file.
date_string = date.today().isoformat()
date_string

In [None]:
# API key needed to carry out API searches.
# The key is taken from a previously defined `api_key` variable when one
# exists; otherwise it is read from the WOS_API_KEY environment variable, so
# the secret does not have to be written into the notebook.
import os

try:
    key = "{}".format(api_key)
except NameError:
    key = os.environ.get("WOS_API_KEY", "")

if not key:
    print("Warning: no API key found; define `api_key` or set WOS_API_KEY before searching.")

# Endpoint that the search requests are sent to.
url = "https://wos-api.clarivate.com/api/wos/"

## Basin search terms

In [None]:
# Atlantic basin query: an Atlantic place name must appear in BOTH the title
# (TI) and the abstract (AB); results are further limited by document type
# (DT), publication year (PY, filled in from export_years) and edition (EDN).
# NOTE(review): the abstract clause excludes on TI=("mid-atlantic states" ...),
# the same exclusion as the title clause. AB= may have been intended there;
# confirm before changing, as it alters the result set.
atlantic_query = """(TI=(
"atlantic ocean" OR "atlantic" OR "Mid-Atlantic" OR "Sargasso Sea" OR "Gulf of Mexico" OR "Caribbean" OR "Labrador Sea" 
OR "Argentine Sea" OR "Hudson Bay" OR "Gulf of Maine" OR "Gulf of St Lawrence" OR "Gulf of St. Lawrence" OR
"Gulf of Saint Lawrence" OR "Gulf Stream" OR "Florida Current" OR "Brazil Current" OR "Labrador Current" OR
"North Sea" OR "Mediterranean Sea" OR "Gulf of Guinea" OR "Bay of Biscay" OR "Celtic Sea" OR "Benguela Current"
OR "Canary Current") NOT TI=("mid-atlantic states" OR "mid atlantic states"))

AND 

(AB=("atlantic ocean" OR "atlantic" OR "Mid-Atlantic" OR "Sargasso Sea" OR "Gulf of Mexico" OR "Caribbean" OR "Labrador Sea"
OR "Argentine Sea" OR "Hudson Bay" OR "Gulf of Maine" OR "Gulf of St Lawrence" OR "Gulf of St. Lawrence" OR
"Gulf of Saint Lawrence" OR "Gulf Stream" OR "Florida Current" OR "Brazil Current" OR "Labrador Current" OR
"North Sea" OR "Mediterranean Sea" OR "Gulf of Guinea" OR "Bay of Biscay" OR "Celtic Sea" OR "Benguela Current"
OR "Canary Current") NOT TI=("mid-atlantic states" OR "mid atlantic states")) 

AND
(DT==("ARTICLE"))
AND 
(PY=({}))
AND 
(EDN==("WOS.SCI" OR "WOS.SSCI" OR "WOS.AHCI" OR "WOS.ESCI"))""".format(" OR ".join(map(str, export_years)))

In [None]:
# Echo the assembled Atlantic query string (last expression in the cell).
atlantic_query

In [None]:
# Pacific basin query: a Pacific place name must appear in BOTH the title (TI)
# and the abstract (AB); results are further limited by document type (DT),
# publication year (PY, filled in from export_years) and edition (EDN).
# The current name is spelled "Kuroshio" (the earlier "Kurushio" spelling was a
# typo in the search term).
pacific_query ="""(TI=(
"Eastern Pacific" OR
"Chilean Sea" OR
"Gulf of Alaska" OR
"Mar de Grau" OR
"Gulf of California" OR
"Alaska Current" OR
"California Current" OR
"Western Pacific" OR
"Philippine Sea" OR
"South China Sea" OR
"Tasman Sea" OR
"East China Sea" OR
"Coral Sea" OR
"Australian Mediterranean Sea" OR
"Sea of Okhotsk" OR
"Sea of Japan" OR
"Solomon Sea" OR
"Banda Sea" OR
"Arafura Sea" OR
"Timor Sea" OR
"Yellow Sea" OR
"Java Sea" OR
"Gulf of Thailand" OR
"Gulf of Carpentaria" OR
"Celebes Sea" OR
"Sulu Sea" OR
"Bismarck Sea" OR
"Gulf of Anadyr" OR
"Molucca Sea" OR
"Indonesian Throughflow" OR
"Kuroshio Current" OR
"Oyashio Current" OR
"Central Pacific" OR
"Pacific Ocean" OR
"Pacific" OR
"Bering Sea" OR
"North Pacific gyre" OR
"South Pacific gyre"))

AND

(AB=(
"Eastern Pacific" OR
"Chilean Sea" OR
"Gulf of Alaska" OR
"Mar de Grau" OR
"Gulf of California" OR
"Alaska Current" OR
"California Current" OR
"Western Pacific" OR
"Philippine Sea" OR
"South China Sea" OR
"Tasman Sea" OR
"East China Sea" OR
"Coral Sea" OR
"Australian Mediterranean Sea" OR
"Sea of Okhotsk" OR
"Sea of Japan" OR
"Solomon Sea" OR
"Banda Sea" OR
"Arafura Sea" OR
"Timor Sea" OR
"Yellow Sea" OR
"Java Sea" OR
"Gulf of Thailand" OR
"Gulf of Carpentaria" OR
"Celebes Sea" OR
"Sulu Sea" OR
"Bismarck Sea" OR
"Gulf of Anadyr" OR
"Molucca Sea" OR
"Indonesian Throughflow" OR
"Kuroshio Current" OR
"Oyashio Current" OR
"Central Pacific" OR
"Pacific Ocean" OR
"Pacific" OR
"Bering Sea" OR
"North Pacific gyre" OR
"South Pacific gyre"))

AND
(DT==("ARTICLE"))
AND 
(PY=({}))
AND 
(EDN==("WOS.SCI" OR "WOS.SSCI" OR "WOS.AHCI" OR "WOS.ESCI"))""".format(" OR ".join([str(x) for x in export_years]))

In [None]:
# Echo the assembled Pacific query string (last expression in the cell).
pacific_query

In [None]:
# Arctic basin query: an Arctic place name must appear in BOTH the title (TI)
# and the abstract (AB); results are further limited by document type (DT),
# publication year (PY, filled in from export_years) and edition (EDN).
arctic_query = """(TI=(
"Arctic Ocean" OR
"Barents Sea" OR
"Hudson Bay" OR
"Chukchi Sea" OR
"Greenland Sea" OR
"East Siberian Sea" OR
"Kara Sea" OR
"Laptev Sea" OR
"Beaufort Sea" OR
"Amundsen Gulf" OR
"White Sea" OR
"Pechora Sea" OR
"Lincoln Sea" OR
"Beaufort Gyre" OR
"Baffin Bay" OR
"Denmark Strait" OR
"Davis Strait" OR
"Bering Strait" OR
"East Greenland Current" OR
"E. Greenland Current" OR
"Norwegian Current"))

AND

(AB=(
"Arctic Ocean" OR
"Barents Sea" OR
"Hudson Bay" OR
"Chukchi Sea" OR
"Greenland Sea" OR
"East Siberian Sea" OR
"Kara Sea" OR
"Laptev Sea" OR
"Beaufort Sea" OR
"Amundsen Gulf" OR
"White Sea" OR
"Pechora Sea" OR
"Lincoln Sea" OR
"Beaufort Gyre" OR
"Baffin Bay" OR
"Denmark Strait" OR
"Davis Strait" OR
"Bering Strait" OR
"East Greenland Current" OR
"E. Greenland Current" OR
"Norwegian Current"))

AND
(DT==("ARTICLE"))
AND 
(PY=({}))
AND 
(EDN==("WOS.SCI" OR "WOS.SSCI" OR "WOS.AHCI" OR "WOS.ESCI"))""".format(" OR ".join(map(str, export_years)))

In [None]:
# Echo the assembled Arctic query string (last expression in the cell).
arctic_query

In [None]:
# Southern Ocean query: a Southern Ocean place name must appear in BOTH the
# title (TI) and the abstract (AB); results are further limited by document
# type (DT), publication year (PY, filled in from export_years) and edition
# (EDN).
# The sea name is spelled "Bellingshausen" (the earlier "Bellinghausen"
# spelling was a typo in the search term).
southern_query = """(TI=(
"Southern Ocean" OR
"Weddell Sea" OR
"Ross Sea" OR
"Scotia Sea" OR
"Somov Sea" OR
"Lazarev Sea" OR
"Amundsen Sea" OR
"Riiser-Larsen Sea" OR
"Cosmonauts Sea" OR
"Bellingshausen Sea" OR
"Mawson Sea" OR
"Cooperation Sea" OR
"Davis Sea" OR
"Antarctic Circumpolar Current" OR
"Great Australian Bight"))

AND

(AB=(
"Southern Ocean" OR
"Weddell Sea" OR
"Ross Sea" OR
"Scotia Sea" OR
"Somov Sea" OR
"Lazarev Sea" OR
"Amundsen Sea" OR
"Riiser-Larsen Sea" OR
"Cosmonauts Sea" OR
"Bellingshausen Sea" OR
"Mawson Sea" OR
"Cooperation Sea" OR
"Davis Sea" OR
"Antarctic Circumpolar Current" OR
"Great Australian Bight"))

AND
(DT==("ARTICLE"))
AND 
(PY=({}))
AND 
(EDN==("WOS.SCI" OR "WOS.SSCI" OR "WOS.AHCI" OR "WOS.ESCI"))""".format(" OR ".join([str(x) for x in export_years]))

In [None]:
# Echo the assembled Southern Ocean query string (last expression in the cell).
southern_query

In [None]:
# Indian Ocean query: an Indian Ocean place name must appear in BOTH the title
# (TI) and the abstract (AB); results are further limited by document type
# (DT), publication year (PY, filled in from export_years) and edition (EDN).
indian_query = """(TI=(
"Indian Ocean" OR
"Bay of Bengal" OR
"Arabian Sea" OR
"Andaman Sea" OR
"Laccadive Sea" OR
"Mozambique Channel" OR
"Timor Sea" OR
"Red Sea" OR
"Gulf of Aden" OR
"Persian Gulf" OR
"Flores Sea" OR
"Molucca Sea" OR
"Oman Sea" OR
"Agulhas Current" OR
"Mozambique Current" OR
"West Australian Current" OR
"Somali Current" OR
"India Coastal Current" OR
"The Great Whirl" OR
"Socotra Gyre" OR
"Sri Lanka Dome" OR
"Thermocline Ridge"))

AND

(AB=(
"Indian Ocean" OR
"Bay of Bengal" OR
"Arabian Sea" OR
"Andaman Sea" OR
"Laccadive Sea" OR
"Mozambique Channel" OR
"Timor Sea" OR
"Red Sea" OR
"Gulf of Aden" OR
"Persian Gulf" OR
"Flores Sea" OR
"Molucca Sea" OR
"Oman Sea" OR
"Agulhas Current" OR
"Mozambique Current" OR
"West Australian Current" OR
"Somali Current" OR
"India Coastal Current" OR
"The Great Whirl" OR
"Socotra Gyre" OR
"Sri Lanka Dome" OR
"Thermocline Ridge"))

AND
(DT==("ARTICLE"))
AND 
(PY=({}))
AND 
(EDN==("WOS.SCI" OR "WOS.SSCI" OR "WOS.AHCI" OR "WOS.ESCI"))""".format(" OR ".join(map(str, export_years)))

In [None]:
# Echo the assembled Indian Ocean query string (last expression in the cell).
indian_query

In [None]:
# Basins to search, as label -> query. The search loop consumes two parallel
# lists (labels and queries), so both are derived from this one mapping to
# keep them aligned.
selected_basins = {'atlantic': atlantic_query}

ocean_list = list(selected_basins.keys())
ocean_queries = list(selected_basins.values())


## RUN API search

Produces a dataframe of relevant articles

In [None]:
def _as_list(value):
    """Return *value* wrapped in a list unless it already is one.

    Several response fields hold either a single item or a list of items
    (addresses, identifiers, organisations), so callers normalise through
    this helper instead of branching on the type each time. ``None`` maps to
    an empty list.
    """
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _organization_names(address_spec):
    """Collect the organisation names found in one ``address_spec`` mapping."""
    names = set()
    if 'organizations' not in address_spec:
        return names

    def _name(org):
        # An entry is either a plain name or a mapping carrying it under 'content'.
        return org['content'] if isinstance(org, dict) else org

    organizations = address_spec['organizations']
    if isinstance(organizations, dict):
        org = organizations['organization']
        if isinstance(org, list):
            # Only the first entry of a multi-entry list is kept (the
            # "unified" name in the original script).
            if org:
                names.add(_name(org[0]))
        else:
            names.add(_name(org))
    else:
        for org_record in organizations:
            names.add(_name(org_record['organization']))
    return names


def _extract_doi(result, ut):
    """Return the DOI of *result*, or ``None`` when no DOI identifier is present.

    Prints ``'no identifiers'`` with the record's UID when the identifier
    section is missing or malformed.
    """
    doi = None
    try:
        identifiers = result['dynamic_data']['cluster_related']['identifiers']['identifier']
        # A record with a single identifier carries one mapping rather than a list.
        for id_rec in _as_list(identifiers):
            if id_rec['type'] == 'doi':
                doi = id_rec['value']
    except (KeyError, TypeError):
        print('no identifiers', ut)
    return doi


def _parse_record(result, ocean):
    """Flatten one API record into the dict that becomes a row of the extract.

    Parameters
    ----------
    result : dict
        One entry of the response's ``REC`` list.
    ocean : str
        Basin label stored in the row's ``ocean`` column.
    """
    ut = result['UID']
    summary = result['static_data']['summary']
    metadata = result['static_data']['fullrecord_metadata']

    titles = summary['titles']['title']
    paper_title = None
    journal = None
    for title_record in titles:
        if title_record['type'] == 'item':
            paper_title = title_record['content']
        elif title_record['type'] == 'source_abbrev':
            journal = title_record['content']

    abstract = None
    if 'abstracts' in metadata and metadata['abstracts']['count'] == 1:
        abstract = metadata['abstracts']['abstract']['abstract_text']['p']
    if abstract and isinstance(abstract, list):
        # Multi-paragraph abstracts arrive as a list of paragraphs.
        abstract = " ".join([str(x) for x in abstract])

    woscats = []
    if 'category_info' in metadata:
        for category_record in _as_list(metadata['category_info']['subjects']['subject']):
            if category_record['ascatype'] == 'traditional':
                woscats.append(category_record['content'])

    keywords = []
    if 'keywords' in metadata:
        try:
            # A lone keyword is a bare str/int rather than a list; wrapping it
            # avoids splitting a string into characters or failing on an int.
            keywords = [str(x) for x in _as_list(metadata['keywords']['keyword'])]
        except (KeyError, TypeError):
            print(ut)

    countries = set()
    orgs = set()
    if 'addresses' in metadata and 'address_name' in metadata['addresses']:
        for address_record in _as_list(metadata['addresses']['address_name']):
            address_spec = address_record['address_spec']
            country = address_spec.get('country')
            if country:
                countries.add(country)
            orgs |= _organization_names(address_spec)

    doi = _extract_doi(result, ut)

    year = str(summary['pub_info']['pubyear'])
    doctype = str(summary['doctypes']['doctype'])
    # NOTE(review): j20 is taken positionally (second title entry) rather than
    # by title type - confirm this is the intended journal field.
    j20 = str(titles[1]['content'])

    citation_count = None
    for cite_count_rec in _as_list(result['dynamic_data']['citation_related']['tc_list']['silo_tc']):
        if cite_count_rec['coll_id'] == 'WOS':
            citation_count = cite_count_rec['local_count']

    return {
        'ut': ut,
        'doi': doi,
        'source_year': year,
        'doc_type': doctype,
        'title': paper_title,
        'journal': journal,
        'j20': j20,
        'abstract': abstract,
        'keywords': "|".join(keywords),
        'woscats': "|".join(woscats),
        'citation count': citation_count,
        # Sets are sorted so the joined strings are reproducible between runs.
        'countries': "|".join(sorted(countries)),
        'ocean': ocean,
        'orgs': "|".join(sorted(orgs)),
    }


records_final = pd.DataFrame()
basin_frames = []  # one DataFrame per basin that returned at least one record

for ocean, query in zip(ocean_list, ocean_queries):
    print(ocean)

    batch_size = 100

    records = []

    cursor = 1
    while True:
        # Progress indicator: 1-based index of the first record being requested.
        sys.stdout.write('{:,}\r'.format(cursor))
        sys.stdout.flush()
        params = {
            'databaseId': 'WOS',
            'usrQuery': query,
            'count': batch_size,
            'firstRecord': cursor,
        }
        r = requests.get(
            url,
            headers={'X-ApiKey': key},
            params=params,
        )
        if r.status_code != 200:
            print("Error")
            print(r)
            break

        data = r.json()
        page = data['Data']['Records']['records']
        # Guard against a page with no 'REC' entry (e.g. an empty final page).
        results = _as_list(page.get('REC')) if isinstance(page, dict) else []
        records.extend(_parse_record(result, ocean) for result in results)

        # A short page means the last page has been reached.
        if len(results) != batch_size:
            break
        cursor += batch_size

        time.sleep(1)

    temp_df = pd.DataFrame(records)

    # DataFrame.append was removed in pandas 2.0; rebuild the running result
    # with pd.concat after every basin so partial results stay available.
    if not temp_df.empty:
        basin_frames.append(temp_df)
        records_final = pd.concat(basin_frames)

records_final

In [None]:
# Name the file after the basins actually searched (joined with "+"), the year
# range and the extraction date, so the name always matches ocean_list.
# NOTE(review): `path` is not defined in the visible notebook; it must be set
# beforehand to the output location prefix (it is prepended as-is).
basin_label = "+".join(ocean_list)
records_final.to_csv(
    "{}{}_{}-{}-{}_extracted.csv".format(
        path, basin_label, min(export_years), max(export_years), date_string
    ),
    index=False,
)