In [2]:
# (Re)load IPython's autoreload extension and select mode 2, which re-imports
# all modules before each cell executes, so edits to imported packages take
# effect without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [4]:
import wosis
import pandas as pd
import metaknowledge as mk

In [5]:
# Read the local YAML settings that are later handed to `wosis.query`.
# NOTE(review): presumably holds the Web of Science credentials - confirm.
config_file = "config.yml"
wos_config = wosis.load_config(config_file)

In [6]:
# Search vocabulary handed to `wosis.build_query`.
#
# Wildcards:
#   * The dollar sign ($) lets one term cover both British and American
#     spellings, e.g. flavo$r matches both flavour and flavor.
#
# References:
#   https://images.webofknowledge.com/images/help/WOS/hs_wildcards.html
#   http://ipscience-help.thomsonreuters.com/wosWebServicesLite/WebServiceOperationsGroup/WebServiceOperations/g2/user_query.html
search_terms = {
    # Topic phrases a record must match
    "inclusive_kw": (
        "Integrated Assessment and Model$ing",
        "integrated environmental model$ing",
        "integrated assessment",
        "integrated model$ing",
        "environmental model$ing",
        "hydrological model$ing",
        "ecological model$ing",
        "ecosystem model$ing",
        "model$ing framework",
        "framework development",
        "model integration",
    ),
    # Topic words that rule a record out
    "exclusive_kw": (
        "chemical",
        "industry",
        "cancer",
        "gene",
    ),
    # Source (journal / proceedings) titles that rule a record out
    "exclusive_jo": (
        "PSYCHOL*",
        "BIOINFO*",
        "BUSINESS INFORMATION*",
        "MANUFACTURING*",
        "BIOLOGICALLY INSPIRED COGNITIVE ARCHITECTURES*",
        "COMPLEXITY*",
        "INDUSTRIAL ECOLOGY*",
        "QUANTITATIVE FINANCE",
        "VIRTUAL REALITY*",
        "COMMUNICATION NETWORKS*",
        "COMPUTER METHODS AND PROGRAMS IN BIOMEDICINE",
        "ARTIFICIAL INTELLIGENCE IN EDUCATION",
        "INSURANCE MATHEMATICS & ECONOMICS",
        "ASTIN BULLETIN",
        "COMMUNICATION TECHNOLOGY",
        "COMPUTERS & STRUCTURES",
        "CHEMOSPHERE",
        "VISUAL COMPUTING",
        "INTERNATIONAL CONFERENCE ON COMPUTER SCIENCE AND APPLICATIONS",
        "WASTE MANAGEMENT*",
        "OPTIMIZATION AND ENGINEERING",
        "COMPUTERS & OPERATIONS RESEARCH",
    ),
    # Subject areas of interest
    "subject_area": (
        "ENVIRONMENTAL SCIENCES",
        "WATER RESOURCES",
        "ENGINEERING ENVIRONMENTAL",
        "INTERDISCIPLINARY APPLICATIONS",
    ),
}

In [7]:
# Turn the search vocabulary into a query string; `wosis.query` is given a
# list of queries, so wrap the single query in a list.
topics = [wosis.build_query(search_terms)]

In [8]:
# Show the generated query string(s) so they can be checked by eye before submission
topics

['TS=(("Integrated Assessment and Model$ing" OR "integrated environmental model$ing" OR "integrated assessment" OR "integrated model$ing" OR "environmental model$ing" OR "hydrological model$ing" OR "ecological model$ing" OR "ecosystem model$ing" OR "model$ing framework" OR "framework development" OR "model integration") NOT ("chemical" OR "industry" OR "cancer" OR "gene")) NOT SO=("PSYCHOL*" OR "BIOINFO*" OR "BUSINESS INFORMATION*" OR "MANUFACTURING*" OR "BIOLOGICALLY INSPIRED COGNITIVE ARCHITECTURES*" OR "COMPLEXITY*" OR "INDUSTRIAL ECOLOGY*" OR "QUANTITATIVE FINANCE" OR "VIRTUAL REALITY*" OR "COMMUNICATION NETWORKS*" OR "COMPUTER METHODS AND PROGRAMS IN BIOMEDICINE" OR "ARTIFICIAL INTELLIGENCE IN EDUCATION" OR "INSURANCE MATHEMATICS & ECONOMICS" OR "ASTIN BULLETIN" OR "COMMUNICATION TECHNOLOGY" OR "COMPUTERS & STRUCTURES" OR "CHEMOSPHERE" OR "VISUAL COMPUTING" OR "INTERNATIONAL CONFERENCE ON COMPUTER SCIENCE AND APPLICATIONS" OR "WASTE MANAGEMENT*" OR "OPTIMIZATION AND ENGINEERING" O

See the Web of Science [field tags](http://ipscience-help.thomsonreuters.com/wosWebServicesLite/WebServiceOperationsGroup/WebServiceOperations/g2/user_query/field_tags/WOSfieldTags.html) reference for help constructing search parameters.
    

## Send query to Clarivate Web of Science servers

In [9]:
%%time
# NOTE: `%%time` is a cell magic and has to stay on the first line of the cell.
# Submit the queries in `topics` using the settings in `wos_config`.
# `id_to_query` is serialised to JSON further down and its first key names the
# record file read from `tmp/`; `query_recs` holds the second returned value.
# NOTE(review): presumably results are cached on disk and reused unless
# `overwrite` is True - confirm against wosis.
overwrite = False  # Do not overwrite previous identical search if it exists
id_to_query, query_recs = wosis.query(topics, overwrite, wos_config)

Authenticated (SID: 8EeXiW8kXX9LghbPeZK)




Got 8999 records
Wall time: 9.34 s


In [10]:
import json

# Keep a human-readable record of which identifier belongs to which query.
# The file holds pretty-printed JSON; read it back with `json.loads`.
with open('tmp/hash_to_query.txt', 'w') as lookup_fh:
    lookup_json = json.dumps(id_to_query, indent=2)
    lookup_fh.write(lookup_json)

In [11]:
# Only one query was submitted, so take the first (and only) identifier.
query_id = [*id_to_query][0]

In [13]:
# Load the records stored for this query, then restrict the collection by
# publication year.
# NOTE(review): the (0, 2017) bounds are presumably inclusive - confirm against
# the metaknowledge documentation for `yearSplit`.
first_year, last_year = 0, 2017
records_path = f"tmp/{query_id}.txt"
all_records = mk.RecordCollection(records_path)
RC = all_records.yearSplit(first_year, last_year)
print("Corpora consists of", len(RC), "Publications")

Corpora consists of 8201 Publications


# Create representative data store

The dataset prepared for publication cannot include WoS-generated keywords or abstract text. It must also include attribution text.

In [14]:
from datetime import datetime

# Build the publishable table: one row per record, with the WoS field tags
# AU, SO and DE requested as extra columns.
corpora = RC.forNLP(extraColumns=["AU", "SO", "DE"])

# The keyword and abstract columns may not be redistributed, so leave them out.
# `drop` returns a new frame (no `inplace`), which keeps this cell safe to re-run.
corpora_df = pd.DataFrame(corpora).drop(["keywords", "abstract"], axis=1)
corpora_df.index.name = "item"

assert corpora_df["id"].is_unique, "Duplicate records found!"

corpora_fn = "../data/{}.csv".format(query_id)

# Step 1: (re)create the file with the attribution header as `#` comment lines.
# The encoding is explicit so non-ASCII text does not depend on the platform
# default.
with open(corpora_fn, 'w', encoding='utf-8') as fn:
    fn.write("# Data from Clarivate Analytics' Web of Science, retrieved 1 December 2018\n")
    fn.write("# This file generated on {}\n".format(datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

# Step 2: append the table by *path* rather than through the text-mode handle.
# pandas requires handles passed to `to_csv` to be opened with newline='';
# without that, rows on Windows end in "\r\r\n" and show up as blank lines.
# Letting pandas open the file itself avoids the problem.
corpora_df.to_csv(corpora_fn, mode='a', encoding='utf-8')