In [1]:
# Common packages and variables.
# NOTE(review): later cells use names that are not defined anywhere in this
# notebook (e.g. `wosis`, `pd`, `json`, `WOS_CONFIG`, `TMP_DIR`); presumably
# they are provided by the setup notebook executed here -- confirm.
%run "Common Setup.ipynb"

In [2]:
# Web of Science search query, kept as a one-element list of query strings.
# The dollar sign ($) is a wildcard used to match British and American spellings,
# e.g. flavo$r matches both flavour and flavor (so model$ing below is meant to
# cover both "modeling" and "modelling").
# see https://images.webofknowledge.com/images/help/WOS/hs_wildcards.html
# http://ipscience-help.thomsonreuters.com/wosWebServicesLite/WebServiceOperationsGroup/WebServiceOperations/g2/user_query.html
# NOTE(review): the query text looks like it is part of the cache identity
# (see the hash-like `query_id` shown further down), so even whitespace edits
# to the string may trigger a fresh search -- confirm before reformatting.
search_query = ["""
TS=(("Integrated Assessment and Model$ing" OR "integrated environmental model$ing" 
     OR "integrated assessment" OR "integrated model$ing" OR "environmental model$ing" 
     OR "hydrological model$ing" OR "ecological model$ing" OR "ecosystem model$ing" 
     OR "model$ing framework" OR "framework development" OR "model integration") 
NOT ("chemical" OR "industry" OR "cancer" OR "gene")) 
NOT SO=("PSYCHOL*" OR "BIOINFO*" OR "BUSINESS INFORMATION*" OR "MANUFACTURING*" 
        OR "BIOLOGICALLY INSPIRED COGNITIVE ARCHITECTURES*" OR "COMPLEXITY*" 
        OR "INDUSTRIAL ECOLOGY*" OR "QUANTITATIVE FINANCE" OR "VIRTUAL REALITY*" 
        OR "COMMUNICATION NETWORKS*" OR "COMPUTER METHODS AND PROGRAMS IN BIOMEDICINE" 
        OR "ARTIFICIAL INTELLIGENCE IN EDUCATION" OR "INSURANCE MATHEMATICS & ECONOMICS" 
        OR "ASTIN BULLETIN" OR "COMMUNICATION TECHNOLOGY" OR "COMPUTERS & STRUCTURES" 
        OR "CHEMOSPHERE" OR "VISUAL COMPUTING" OR "INTERNATIONAL CONFERENCE ON COMPUTER SCIENCE AND APPLICATIONS" 
        OR "WASTE MANAGEMENT*" OR "OPTIMIZATION AND ENGINEERING" OR "COMPUTERS & OPERATIONS RESEARCH")
AND WC=("ENVIRONMENTAL SCIENCES" OR "WATER RESOURCES" OR "ENGINEERING ENVIRONMENTAL" OR "INTERDISCIPLINARY APPLICATIONS")
AND SU=("Environmental Sciences & Ecology" OR "Water Resources" OR "Computer Science")
"""]

See [field tags](http://ipscience-help.thomsonreuters.com/wosWebServicesLite/WebServiceOperationsGroup/WebServiceOperations/g2/user_query/field_tags/WOSfieldTags.html) to help construct search parameters
    

## Send query to Clarivate Web of Science servers

In [3]:
%%time
overwrite = False  # Do not overwrite previous identical search if it exists
search_span = {
    'begin': '1970-01-01',
    'end': '2018-12-31'
}

id_to_query, query_recs = wosis.query(search_query, overwrite, WOS_CONFIG, time_span=search_span, tmp_dir=TMP_DIR, skip_refs=True)

Authenticated (SID: 5Fdzl2aG2XkdMVM1TOF)
Got 8622 records
Wall time: 10.4 s


The command above saves a cached copy of the ID-to-query mapping in the indicated temporary directory.
It can be loaded with the following snippet if desired:

```python
import json

with open('../tmp/hash_to_query.txt', 'r') as fn:
    records = json.load(fn)
```

Below, a dated canonical copy of the ID-to-query mapping is saved so that it can be reused later.

In [4]:
import os

# Keep a dated copy of the ID -> query mapping in the temporary directory.
canonical_path = os.path.join(TMP_DIR, '2019-02-24_hash_to_query.txt')
with open(canonical_path, 'w') as out_file:
    json.dump(id_to_query, out_file, indent=2)

In [5]:
# Only one query was submitted, so take the first identifier yielded by
# `query_recs` and load the matching results file from the temporary directory.
query_id = [*query_recs][0]
results_path = "{}/{}.txt".format(TMP_DIR, query_id)
RC = wosis.load_query_results(results_path)

In [6]:
# Show the query identifier; it is used as the file name stem for both the
# cached results file and the exported CSV.
query_id

'9c6ab9828cde626ed770f2efd71c7f17_1970-01-01-2018-12-31'

In [7]:
# Size of the collection loaded from the cached query results.
len(RC)

8622

In [8]:
# NOTE(review): yearSplit(0, 2018) presumably keeps only records published up
# to and including 2018 -- confirm against the collection's documentation.
RC = RC.yearSplit(0, 2018)
n_publications = len(RC)
print("Corpora consists of", n_publications, "Publications")

Corpora consists of 8622 Publications


# Create representative data store

The dataset released for publication cannot include WoS-generated keywords or abstract text, and it must include attribution text.

In [9]:
from datetime import datetime

# Build the publishable table from the record collection. The "keywords" and
# "abstract" columns are removed because the published dataset may not contain
# them (see the note above this cell).
corpora = RC.forNLP(extraColumns=["AU", "SO", "DE"])
corpora_df = pd.DataFrame(corpora).drop(columns=["keywords", "abstract"])
corpora_df.index.name = "item"

# Every record must appear exactly once in the exported file.
assert corpora_df["id"].is_unique, "Duplicate records found!"

corpora_fn = "../data/{}.csv".format(query_id)
generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Write the attribution header first. The retrieval year is 2019: the search
# window ends on 2018-12-31 and the cached query mapping is dated 2019-02-24.
with open(corpora_fn, 'w', encoding="utf-8") as header_file:
    header_file.write("# Data from Clarivate Analytics' Web of Science, retrieved 24 February 2019\n")
    header_file.write("# This file generated on {}\n".format(generated_at))

# Append the table by path rather than through an already-open text handle:
# pandas then manages newline translation itself (a handle opened without
# newline='' yields doubled line endings on Windows), and the encoding is
# explicit instead of depending on the platform default.
corpora_df.to_csv(corpora_fn, mode="a", encoding="utf-8")