In [49]:
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from collections import defaultdict
from habanero import Crossref, counts, cn
import datetime

In [50]:
# CrossRef API client (habanero); handed to metadata_retriever for the DOI lookups
cr = Crossref()

In [51]:
def metadata_retriever(cr, DOI):
    """
    Look up a single work through the given client and hand back its metadata.

    @param  object       The client whose works() method performs the lookup
    @param  string       The DOI
    @return dict         The 'message' part of the response, i.e. the metadata
    """

    # query by DOI and unwrap the payload from the response envelope
    return cr.works(ids=DOI)['message']

In [57]:
# fetch the metadata of a sample DOI and show it (this performs a live lookup through `cr`)
metadata = metadata_retriever(cr, '10.1002/prot.20882') # DOIBoost DOI: '10.1007/s00422-017-0730-1'
print(metadata)

ConnectionError: HTTPSConnectionPool(host='api.crossref.org', port=443): Max retries exceeded with url: /works/10.1002/prot.20882 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7fdd50024860>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [None]:
def author_retriever(metadata):
    """
    Build the list of author names found in the metadata.

    Every entry of the 'author' field contributes "<given> <family>", or just
    the family name when no given name is present. Entries without a family
    name are left out.

    @param  dict    The CrossRef metadata
    @return array   The list of author names, or None when the metadata
                    has no (or an empty) 'author' field
    """

    entries = metadata.get('author')

    # nothing to do without authors
    if not entries:
        return None

    names = []
    for entry in entries:
        given = entry.get('given')
        family = entry.get('family')

        # an author is only usable when the surname is known
        if family:
            # prefix the first name when there is one
            names.append(given + " " + family if given else family)

    return names

In [6]:
# extract the author names from the metadata fetched above and show them
authors = author_retriever(metadata)
print(authors)

['Robert Lowe', 'Alexander Almér', 'Erik Billing', 'Yulia Sandamirskaya', 'Christian Balkenius']


In [7]:
# Elasticsearch settings; `es`, `searchindex` and `doctype` are read as globals by retrieve_countries
es_host = "elasticsnarcis"
es_local = Elasticsearch([es_host])
# index and document type that hold the author records
searchindex = 'authors'
doctype = 'metadata'
# `es` is the client name the queries in this notebook use
es = es_local

In [8]:
# might be needed later
def is_before(this, that):
    """
    Check whether one calendar date lies strictly before another.

    @param  sequence  The first date as (year, month, day) integers
    @param  sequence  The second date as (year, month, day) integers
    @return boolean   True when the first date is earlier than the second,
                      False when it is equal or later
    """

    # split both dates into their parts (each must have exactly three)
    this_year, this_month, this_day = this
    that_year, that_month, that_day = that

    # the file does `import datetime`, so the class has to be addressed as
    # datetime.datetime; calling the module itself raises a TypeError
    this_date = datetime.datetime(this_year, this_month, this_day)
    that_date = datetime.datetime(that_year, that_month, that_day)

    return this_date < that_date

In [9]:
def retrieve_countries(authors, metadata):
    """
    Get the country of the authors

    Every author name is looked up in the author index through the
    module-level `es` client (using `searchindex` and `doctype`), and the
    country of the first hit is taken. An author is skipped when the lookup
    fails, returns no hits, or the first hit has no country, so the result
    can be shorter than the list of authors.

    @param  array   A list of author names (None is treated as an empty list)
    @param  dict    The CrossRef metadata that contains the date of the research
                    (not used by the lookup yet)
    @return array   The list of countries
    """

    # resulting list
    result = []

    # author_retriever returns None when the metadata has no authors
    if not authors:
        return result

    # loop over the authors
    for author in authors:

        # try to find the name in the author dataset
        try:
            res = es.search(index=searchindex, doc_type=doctype, body={"query": {"match": {'name': "%s" % author}}})
            hits = res['hits']['hits']
        except Exception:
            # best effort: a failed lookup only costs us this author
            continue

        # no match for this author
        if not hits:
            continue

        # `hits` is a list, so take the first hit whether there is one or many
        country = hits[0].get('_source', {}).get('country')

        # only record hits that actually carry a country
        if country:
            result.append(country)

    # return the list of countries
    return result

In [10]:
# look up a country for each author name and show the result
countries = retrieve_countries(authors, metadata)
print(countries)

['Sweden', 'Sweden', 'Sweden', 'Switzerland', 'Sweden']


In [11]:
def classify(countries):
    """
    Decide whether a DOI counts as Dutch or Non Dutch.

    @param  array     The countries of the affiliations the authors work for
    @return boolean   True (Dutch) when 'Netherlands' is one of the countries,
                      False (Non Dutch) otherwise
    """

    # a single Dutch affiliation is enough for the DOI to be Dutch
    return 'Netherlands' in countries

In [12]:
# classify the DOI as Dutch (True) or Non Dutch (False) from the countries found above
classification = classify(countries)
print(classification)

False


In [56]:
# inspect raw documents of the author index to see which fields they carry
res = es.search(index=searchindex, doc_type=doctype, body={'query':{}})
res

{'took': 3,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'failed': 0},
 'hits': {'total': 832097,
  'max_score': 1.0,
  'hits': [{'_index': 'authors',
    '_type': 'metadata',
    '_id': '116',
    '_score': 1.0,
    '_source': {'name': 'Xitao Fan',
     'country': 'Macao',
     'date': '2016-12-30'}},
   {'_index': 'authors',
    '_type': 'metadata',
    '_id': '119',
    '_score': 1.0,
    '_source': {'name': 'Michael Weinfeld',
     'country': 'Canada',
     'date': '2016-12-9'}},
   {'_index': 'authors',
    '_type': 'metadata',
    '_id': '120',
    '_score': 1.0,
    '_source': {'name': 'Alberto Dos Santos Pereira',
     'country': 'Canada',
     'date': '2016-12-9'}},
   {'_index': 'authors',
    '_type': 'metadata',
    '_id': '123',
    '_score': 1.0,
    '_source': {'name': 'Bojan Mohar',
     'country': 'Canada',
     'date': '2016-6-17'}},
   {'_index': 'authors',
    '_type': 'metadata',
    '_id': '126',
    '_score': 1.0,
    '_source': {'name': 'Jean-P