In [1]:
# Load (or reload) IPython's autoreload extension and switch it to mode 2,
# so edits to imported modules are picked up before each cell executes.
%reload_ext autoreload
%autoreload 2

In [2]:
import yaml

import os
import wos
import wos_parser
import metaknowledge as mk

import wosis
import pandas as pd

In [3]:
# Read the Web of Science settings from the local config file.
# `yaml.safe_load` replaces the bare `yaml.load(config)`: calling `yaml.load`
# without an explicit Loader is deprecated (and raises a TypeError on
# PyYAML >= 6), and the safe loader only constructs plain Python objects.
with open("config.yml") as config:
    # Only the `wos` section of the file is used by this notebook.
    wos_config = yaml.safe_load(config)["wos"]

In [4]:
# In Web of Science queries the dollar sign ($) is a wildcard that covers both
# British and American spellings, e.g. flavo$r matches flavour as well as flavor.
# Reference: https://images.webofknowledge.com/images/help/WOS/hs_wildcards.html

# Topic phrases to search for.
inclusive_search_terms = (
    "Integrated Assessment and Model$ing",
    "integrated environmental model$ing",
    "integrated assessment",
    "integrated model$ing",
    "environmental model$ing",
    "hydrological model$ing",
    "ecological model$ing",
    "ecosystem model$ing",
    "model$ing framework",
    "framework development",
    "model integration",
)

# Topic terms to exclude from the search.
exclusive_search_terms = (
    "chemical",
    "industry",
    "cancer",
    "gene",
)

# Web of Science subject categories the search is restricted to.
subject_area = (
    "ENVIRONMENTAL SCIENCES",
    "WATER RESOURCES",
    "ENGINEERING ENVIRONMENTAL",
    "INTERDISCIPLINARY APPLICATIONS",
)

In [5]:
# Build the query string from the search terms and wrap it in a one-element
# list, which is what gets passed on to the query step.
topics = [
    wosis.build_query(inclusive_search_terms, exclusive_search_terms, subject_area),
]

In [6]:
topics

['TS=(("Integrated Assessment and Model$ing" OR "integrated environmental model$ing" OR "integrated assessment" OR "integrated model$ing" OR "environmental model$ing" OR "hydrological model$ing" OR "ecological model$ing" OR "ecosystem model$ing" OR "model$ing framework" OR "framework development" OR "model integration") NOT ("chemical" OR "industry" OR "cancer" OR "gene")) AND WC=("ENVIRONMENTAL SCIENCES" OR "WATER RESOURCES" OR "ENGINEERING ENVIRONMENTAL" OR "INTERDISCIPLINARY APPLICATIONS")']

See the Web of Science [field tags](http://ipscience-help.thomsonreuters.com/wosWebServicesLite/WebServiceOperationsGroup/WebServiceOperations/g2/user_query/field_tags/WOSfieldTags.html) reference for help constructing search parameters.
    

## Send query to Clarivate Web of Science servers

In [7]:
%%time
# Run the list of query strings through `wosis.query` using the settings
# loaded from config.yml; the cell is timed with %%time.
# NOTE(review): `overwrite = False` presumably means previously downloaded
# results are reused rather than fetched again - confirm against wosis.
overwrite = False
# `result_hash` is used further down as a mapping whose keys identify queries.
result_hash = wosis.query(topics, overwrite, wos_config)

Authenticated (SID: 5BYkb42wwCwHaeCMiZp)




Got 8966 records
Wall time: 4.45 s


In [8]:
import json

# Store the query result mapping as indented JSON next to the downloaded
# records (read it back with `json.load` / `json.loads`).
with open('tmp/hash_to_query.txt', 'w') as mapping_file:
    json.dump(result_hash, mapping_file, indent=2)

In [9]:
# Only one query was submitted, so take the first (and only) key.
query_id = list(result_hash)[0]

In [10]:
# Load the records stored for this query and restrict them to the
# year range 0-2017 in a single expression.
RC = mk.RecordCollection(f"tmp/{query_id}.txt").yearSplit(0, 2017)
print("Corpora consists of", len(RC), "Publications")

Corpora consists of 8286 Publications


In [11]:
from datetime import datetime

# Flatten the record collection into a table, requesting the extra WoS fields
# AU, SO and DE, then drop the bulky text columns and name the index "item".
# The drop is chained instead of using inplace=True so re-running the cell
# always starts from a freshly built frame.
corpora = RC.forNLP(extraColumns=["AU", "SO", "DE"])
corpora_df = (
    pd.DataFrame(corpora)
    .drop(columns=["keywords", "abstract"])
    .rename_axis("item")
)

corpora_fn = f"data/{query_id}.csv"
# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs(os.path.dirname(corpora_fn), exist_ok=True)

generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# newline="" is what pandas asks for when `to_csv` is given a text handle;
# without it rows can end up with doubled line terminators on Windows.
# UTF-8 is set explicitly so non-ASCII titles/authors do not depend on the
# platform's default encoding.
with open(corpora_fn, "w", newline="", encoding="utf-8") as csv_file:
    # Two "#" comment lines precede the CSV data to record provenance.
    csv_file.write("# Data from Clarivate Analytics' Web of Science, retrieved 29 October 2018\n")
    csv_file.write("# This file generated on {}\n".format(generated_at))
    corpora_df.to_csv(csv_file)