### Generating publication metadata with APIs

#### Dimensions metadata

In [1]:
!jupyter nbconvert --to script get_api_md.ipynb

[NbConvertApp] Converting notebook get_api_md.ipynb to script
[NbConvertApp] Writing 2190 bytes to get_api_md.py


In [2]:
import RichContextAPI


In [18]:
import configparser
import dimensions_search_api_client as dscli

In [19]:
def connect_ds_api(username,password):
    """
    create a DimensionsSearchAPIClient, configure it, and return it

    The client gets the fixed limits used by this notebook
    (max_in_items=100, max_return=1000, max_overall_returns=50000)
    followed by the supplied username and password.
    """
    client = dscli.DimensionsSearchAPIClient()

    # applied in this exact order: limits first, then credentials
    settings = (
        ("set_max_in_items", 100),
        ("set_max_return", 1000),
        ("set_max_overall_returns", 50000),
        ("set_username", username),
        ("set_password", password),
    )
    for setter_name, value in settings:
        getattr(client, setter_name)(value)

    return client

def connect_dimensions_api():
    """
    build a Dimensions client from the credentials in richcontext_config.cfg

    Reads 'username' and 'password' from the DEFAULT section of the config
    file (looked up relative to the current working directory) and passes
    them to connect_ds_api, returning whatever that returns.
    """
    parser = configparser.ConfigParser()
    parser.read("richcontext_config.cfg")

    user = parser.get('DEFAULT', 'username')
    secret = parser.get('DEFAULT', 'password')
    return connect_ds_api(username=user, password=secret)

In [20]:
def dimensions_title_test():
    """
    smoke-test the Dimensions connection with a known publication title

    Sends an exact-title search through the module-level `api_client` and
    prints 'passed test' when the response contains at least one
    publication. Prints 'failed' otherwise, i.e. for an empty/falsy
    response and also for a response whose 'publications' list is empty.
    Returns nothing.
    """
    title = 'Relationships between Diet, Alcohol Preference, and Heart Disease and Type 2 Diabetes among Americans'
    query = 'search publications in title_only for "\\"{}\\"" return publications[all]'.format(title)
    dimensions_return = api_client.execute_query(query_string_IN = query )

    # a response with zero publications used to print nothing at all;
    # treat it as a failure so the check always reports an outcome
    if dimensions_return and len(dimensions_return['publications']) > 0:
        print('passed test')
    else:
        print('failed')
            


In [21]:
api_client = connect_dimensions_api()

In [23]:
dimensions_title_test()

passed test


In [46]:
import urllib
import requests
from bs4 import BeautifulSoup

def gen_empc_url(title):
    """
    build the Europe PMC search URL for a publication title

    The title is percent-encoded with urllib.parse.quote and appended to
    the 'http://europepmc.org/search?query=' prefix; the resulting string
    is returned.
    """
    # a bare `import urllib` does not guarantee the `parse` submodule is
    # loaded, so import it explicitly instead of relying on another module
    import urllib.parse

    return 'http://europepmc.org/search?query=' + urllib.parse.quote(title)

def get_europepmc_metadata (url):
    """
    parse metadata from a Europe PMC web page for a publication

    Fetches `url` and reads, when present:
      'title'   <- content of <meta name="citation_title">
      'doi'     <- content of <meta name="citation_doi">
      'journal' <- text of <span id="pmcmata">; only when no such span
                   exists, content of <meta name="citation_journal_abbrev">
      'pdf'     <- content of <meta name="citation_pdf_url">
      'url'     <- href of <a class="abs_publisher_link">
    When a tag occurs more than once, the last occurrence is used.

    Returns a dict holding only the non-empty fields that were found, or
    None when the page yielded none of them. A missing publisher link no
    longer discards the other fields, and a page without a title no longer
    raises.
    """
    response = requests.get(url).text
    soup = BeautifulSoup(response, "html.parser")

    def last_of(tag, attrs, read):
        # value read from the final matching tag, or None when nothing matches
        matches = soup.find_all(tag, attrs)
        if len(matches) == 0:
            return None
        return read(matches[-1])

    def last_meta(name):
        return last_of("meta", {"name": name}, lambda node: node["content"])

    # the journal falls back to the citation meta tag only when the page
    # has no pmcmata span at all (an empty span text does not trigger it)
    publisher_spans = soup.find_all("span", {"id": "pmcmata"})
    if len(publisher_spans) > 0:
        publisher = publisher_spans[-1].get_text()
    else:
        publisher = last_meta("citation_journal_abbrev")

    found = (
        ('title', last_meta("citation_title")),
        ('doi', last_meta("citation_doi")),
        ('journal', publisher),
        ('pdf', last_meta("citation_pdf_url")),
        ('url', last_of("a", {"class": "abs_publisher_link"}, lambda node: node['href'])),
    )

    # keep only the fields that carry a value, in the original key order
    epmc_data = {key: value for key, value in found if value}
    if epmc_data:
        return epmc_data
    return None
    
    


In [47]:
url = "http://europepmc.org/abstract/MED/20195444"

In [48]:
epmc_md = get_europepmc_metadata(url)

In [49]:
epmc_md

{'title': 'Categorizing US state drinking practices and consumption trends.',
 'doi': '10.3390/ijerph7010269',
 'journal': 'Int J Environ Res Public Health',
 'url': 'https://doi.org/10.3390/ijerph7010269'}

In [3]:
title = 'Relationships between Diet, Alcohol Preference, and Heart Disease and Type 2 Diabetes among Americans'
dimensions_md = RichContextAPI.get_dimensions_md(title)

In [7]:
dimensions_md.keys()

dict_keys(['linkout', 'authors', 'doi', 'keywords', 'journal_title', 'title'])

#### SSRN metadata

In [10]:
title = 'Modeling the Term Structure from the On-the-Run Treasury Yield Curve'
ssrn_md = RichContextAPI.get_ssrn_md(title)

#### Europe PMC

In [None]:
import RichContextAPI
title = "Categorizing US State Drinking Practices and Consumption Trends"
page_data = RichContextAPI.get_epmc_md(title)

#### OpenAire

In [4]:
import RichContextAPI
title = "Categorizing US State Drinking Practices and Consumption Trends"
oa_d = RichContextAPI.oa_lookup_pub_uris(title)

In [56]:
from urllib import parse
import xml.etree.ElementTree as et

def oa_load_uri (uri):
    """
    fetch `uri` and return the response body decoded as UTF-8 text

    Errors raised by urlopen, or by decoding a body that is not valid
    UTF-8, propagate to the caller.
    """
    # neither `import urllib` nor `from urllib import parse` loads the
    # `request` submodule, so import it explicitly before using it
    import urllib.request

    with urllib.request.urlopen(uri) as response:
        return response.read().decode("utf-8")
    

# prefix of the OpenAIRE publication search URL; oa_lookup_pub_uris appends the URL-quoted title to it
API_URI = "http://api.openaire.eu/search/publications?title="

def oa_lookup_pub_uris (title):
    """
    look a publication up by title through the OpenAIRE search API

    Returns a dict with 'journal', 'title' (the title passed in) and 'doi',
    extended with whatever keys oa_extract_pub_uri produced. Returns None
    when oa_extract_pub_uri gave back nothing usable (None or an empty
    dict).
    """
    response_xml = oa_load_uri(API_URI + parse.quote(title))

    links = oa_extract_pub_uri(response_xml)
    journal_name = oa_extract_journal(response_xml)
    doi_value = oa_extract_doi(response_xml)

    if not links:
        return None

    record = {'journal': journal_name, 'title': title, 'doi': doi_value}
    record.update(links)
    return record
    
    


# prefix -> namespace URI map passed to ElementTree's findall so the "oaf:" steps in the search paths resolve
NS = {
    "oaf": "http://namespace.openaire.eu/oaf"
    }

def oa_extract_pub_uri (xml):
    """
    pull the page and PDF links out of an OpenAIRE search response

    Only the first <result> of the response is inspected. Among its
    children/instance/webresource/url elements:
      'pdf' is the first URL containing the substring 'pdf'
      'url' is the first non-PDF URL containing the substring 'europepmc'

    Returns a dict with whichever of those keys were found (possibly
    empty), or None when the response has no result or the result has no
    <url> elements. <url> elements without text are ignored.
    """
    root = et.fromstring(xml)
    result = root.findall("./results/result[1]/metadata/oaf:entity/oaf:result", NS)
    if len(result) == 0:
        return None

    url_nodes = result[0].findall("./children/instance/webresource/url")
    if len(url_nodes) == 0:
        return None

    # an empty <url/> element has text None, which would make the
    # substring tests below raise TypeError, so skip those entries
    url_texts = [node.text for node in url_nodes if node.text]
    pdf_links = [link for link in url_texts if 'pdf' in link]
    page_links = [link for link in url_texts if link not in pdf_links and 'europepmc' in link]

    url_dict = {}
    if len(page_links) > 0:
        url_dict['url'] = page_links[0]
    if len(pdf_links) > 0:
        url_dict['pdf'] = pdf_links[0]
    return url_dict

def oa_extract_publisher (xml):
    """
    read the `name` attribute of the first <collectedfrom> element of the
    first result in an OpenAIRE search response

    Returns None when the response has no result, the result has no
    <collectedfrom> child, or that element carries no `name` attribute.
    """
    root = et.fromstring(xml)
    result = root.findall("./results/result[1]/metadata/oaf:entity/oaf:result", NS)
    if len(result) == 0:
        return None

    publisher_list = result[0].findall("./collectedfrom")
    if len(publisher_list) == 0:
        return None

    # .get keeps a missing attribute consistent with the other "not found"
    # cases (None) instead of raising KeyError
    return publisher_list[0].attrib.get('name')
    
    
def oa_extract_doi (xml):
    """
    return the text of the first <pid classid="doi"> element of the first
    result in an OpenAIRE search response, or None when there is no result
    or no such element
    """
    matches = et.fromstring(xml).findall("./results/result[1]/metadata/oaf:entity/oaf:result", NS)
    if not matches:
        return None

    pid = matches[0].find("./pid[@classid='doi']")
    return None if pid is None else pid.text

def oa_extract_journal (xml):
    """
    return the text of the <journal> child of the first result in an
    OpenAIRE search response, or None when there is no result or no
    <journal> element
    """
    matches = et.fromstring(xml).findall("./results/result[1]/metadata/oaf:entity/oaf:result", NS)
    if not matches:
        return None

    journal_node = matches[0].find("./journal")
    return None if journal_node is None else journal_node.text

In [57]:
title = "Categorizing US State Drinking Practices and Consumption Trends"
oa_d = oa_lookup_pub_uris(title)

In [58]:
oa_d

{'journal': 'International Journal of Environmental Research and Public Health',
 'title': 'Categorizing US State Drinking Practices and Consumption Trends',
 'doi': '10.3390/ijerph7010269',
 'url': 'http://europepmc.org/articles/PMC2819787'}

### Running publications through APIs

In [15]:
import RichContextAPI
import importlib
import json
import time
import os
importlib.reload(RichContextAPI)

<module 'RichContextAPI' from '/Users/sophierand/RCApi/RichContextAPI.py'>

In [16]:
# NOTE(review): absolute, user-specific path — the cells below only run on a machine that has this file at this location
pub_path = '/Users/sophierand/RCPublications/partitions/20190610_usda_iri_publications.json'

In [17]:
with open(pub_path) as json_file:
    publications = json.load(json_file)

In [30]:
p = publications[0]
# p

In [26]:
def get_publication_metadata(p):
    """
    look a publication up in every metadata source and attach the results

    `p` must be a dict with a 'title' key (KeyError otherwise). The title
    is passed in turn to the RichContextAPI lookups for Europe PMC,
    OpenAIRE, SSRN and Dimensions. The value each lookup returns is stored
    on `p` under 'europepmc', 'openaire', 'ssrn' and 'dimensions'
    respectively; this includes None, so a completed lookup always sets
    its key.

    A lookup that raises is reported and skipped, leaving its key unset
    and letting the remaining sources run. 'found metadata in ...' is
    printed only when the lookup actually returned something.

    `p` is updated in place and also returned.
    """
    title = p['title']

    # (key stored on p, label used in messages, RichContextAPI function name)
    sources = (
        ('europepmc', 'europe pmc', 'get_epmc_md'),
        ('openaire', 'openaire', 'oa_lookup_pub_uris'),
        ('ssrn', 'ssrn', 'get_ssrn_md'),
        ('dimensions', 'dimensions', 'get_dimensions_md'),
    )

    for key, label, func_name in sources:
        try:
            # resolve the function inside the try so a missing attribute
            # is handled like any other failed lookup
            metadata = getattr(RichContextAPI, func_name)(title)
        except Exception as err:
            # best effort: one failing source must not stop the others, but
            # unlike a bare `except: pass` the failure is visible and
            # KeyboardInterrupt / SystemExit still propagate
            print('lookup failed in {}: {!r}'.format(label, err))
            continue

        p.update({key: metadata})
        if metadata:
            print('found metadata in ' + label)

    return p

found metadata in europe pmc
found metadata in openaire
found metadata in dimensions


In [29]:
# p.keys()

In [None]:
# epmc_md = RichContextAPI.get_epmc_md(title)
# oa_md = RichContextAPI.oa_lookup_pub_uris(title)
# ssrn_md = RichContextAPI.get_ssrn_md(title)
# dimensions_md = RichContextAPI.get_dimensions_md(title)