## Import the needed libraries

In [2]:
# import re
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from collections import defaultdict
# from habanero import Crossref, counts, cn
# import pandas as pd
# import matplotlib.pyplot as plt
# from matplotlib.ticker import FuncFormatter

## Get the mongo collection

In [6]:
# connect to the MongoDB server that is reachable under the host name 'mongo'
client = MongoClient('mongo')

# select the 'narcis' database on that server
metadatadb = client['narcis']

# select the DOIBoost 2018 collection that holds the metadata records
metacollection = metadatadb['doiboost2018']

## The metadata retriever

In [7]:
def metadata_retriever(collection, DOI):
    """
    Looks up the metadata record that is stored for a DOI.

    @param  Collection   The metadata collection to search in
    @param  string       The DOI to look for
    @return dict         The first matching record, or None if there is none
    """

    # ask for at most one record whose 'doi' field equals the given DOI
    matches = collection.find({'doi': DOI}).limit(1)

    # hand back the first match, or None when the query came back empty
    return next(iter(matches), None)
    

## Get the metadata that belongs to the DOI

In [8]:
# look up the record of an example DOI; the trailing comment keeps other DOIs that can be swapped in
metadata = metadata_retriever(metacollection, '10.1002/pds.4360') #'10.1007/s00422-017-0730-1') # 10.1002/ejp.1078
# show which fields the record has
# (metadata_retriever returns None for a DOI that is not in the collection, in which case this line raises)
print(metadata.keys())

dict_keys(['_id', 'publisher', 'issn', 'doi', 'license', 'published-print', 'title', 'issued', 'abstract', 'doi-url', 'instances', 'authors', 'collectedFrom', 'accepted', 'type', 'published-online', 'subject'])


In [9]:
# display the author entries of the record, including their affiliations and identifiers
metadata['authors']

[{'affiliations': [{'official-page': None,
    'provenance': 'CrossRef',
    'identifiers': [],
    'value': 'Department of Epidemiology; University of North Carolina; Chapel Hill NC USA'},
   {'official-page': 'http://www.unc.edu/',
    'provenance': 'MAG',
    'identifiers': [{'value': 'http://en.wikipedia.org/wiki/University_of_North_Carolina_at_Chapel_Hill',
      'schema': 'wikpedia'},
     {'value': 'grid.10698.36', 'schema': 'grid.ac'}],
    'value': 'University of North Carolina at Chapel Hill'}],
  'given': 'Anne M.',
  'identifiers': [{'provenance': 'MAG',
    'value': 'https://academic.microsoft.com/#/detail/2165510615',
    'schema': 'URL'}],
  'fullname': 'Anne M. Butler',
  'family': 'Butler'},
 {'affiliations': [{'official-page': None,
    'provenance': 'CrossRef',
    'identifiers': [],
    'value': 'Amgen, Inc., Thousand Oaks and South San Francisco; CA USA'},
   {'official-page': 'http://www.amgen.com/',
    'provenance': 'MAG',
    'identifiers': [{'value': 'http://e

In [None]:
# print the affiliation entries of every author of the record, one author per line
for author in metadata['authors']:
    print(author['affiliations'])

## The GRID id retriever

In [None]:
def grid_ids_retriever(metadata):
    """
    Uses metadata to retrieve the GRID ids

    An author can have several affiliation entries and not all of them have
    to carry identifiers, so the GRID ids are collected per author over all
    of the author's affiliations. An identifier counts as a GRID id when its
    'schema' is 'grid.ac'. Authors without any affiliation information are
    skipped. As soon as an author has affiliation information but no GRID id
    at all, the result would be incomplete and an empty list is returned.

    @param  dict    The metadata of a paper (None if the paper is unknown)
    @return array   The list of GRID ids, empty if it can't be complete
    """

    # the schema label that marks an identifier as a GRID id
    grid_schema = 'grid.ac'

    # without metadata there is nothing to retrieve
    if not metadata:
        return []

    # initial list
    gridIDs = []

    # loop over the authors in the list
    for author in metadata.get('authors') or []:

        # get the affiliation(s) of the author
        affiliations = author.get('affiliations')

        # only continue if there is information about the affiliation
        if not affiliations:
            continue

        # the GRID ids found for this author, over all affiliations
        authorIDs = []

        # loop over the affiliation information
        for affiliation in affiliations:

            # an affiliation may come without identifiers
            for identifier in affiliation.get('identifiers') or []:

                # select the GRID id by its schema instead of by its position
                if identifier.get('schema') == grid_schema and identifier.get('value'):
                    authorIDs.append(identifier['value'])

        # if one author doesn't have a GRID id, we can't be sure about the result
        if not authorIDs:
            return []

        gridIDs.extend(authorIDs)

    # return the list of GRID ids
    return gridIDs

## Get the GRID ids that belong to the metadata

In [None]:
# retrieve the GRID ids for the metadata record that was loaded earlier and show them
gridIDs = grid_ids_retriever(metadata)
print(gridIDs)

## Get the GRID index of Elasticsearch

In [None]:
# host name of the Elasticsearch server
es_host = "elasticsnarcis"
# client for that server
es_local = Elasticsearch([es_host])
# index and document type that classify() passes to its searches
searchindex = 'grid'
doctype = 'metadata'
# the client that classify() uses for its searches
es = es_local

## The classifier

In [None]:
def classify(gridIDs):
    """
    Classify a research as Dutch/Non-Dutch based on the grid ids

    Every GRID id is looked up through the module-level `es` client, using
    the module-level `searchindex` and `doctype`. The research is Dutch as
    soon as one GRID id belongs to the Netherlands. It is only Non-Dutch if
    the country of every GRID id could be resolved and none was Dutch.

    @param  array    The grid IDs of the authors
    @return boolean  True for Dutch, False for Non-Dutch, None if unknown
    """

    # we can only classify with 100% True Positives and True Negatives
    # if we have a complete list of GRID ids
    if not gridIDs:
        return None

    # becomes True once a GRID id turns up whose country can't be resolved
    unresolved = False

    # Get GRID data
    for ID in gridIDs:
        # NOTE(review): `doc_type` is presumably only accepted by older
        # Elasticsearch clients - confirm against the installed version
        res = es.search(index=searchindex, doc_type=doctype, body={"query": {"match": {'ID': str(ID)}}})
        hits = res['hits']['hits']

        # the GRID id is not in the index, so its country stays unknown
        if not hits:
            unresolved = True
            continue

        # the country of the best matching GRID record
        country = hits[0]['_source'].get('Country')

        # Verify Dutch affiliation
        if country == 'Netherlands':
            return True

        # a record without a country is as uninformative as no record
        if country is None:
            unresolved = True

    # a Non-Dutch verdict requires that every GRID id was resolved
    return None if unresolved else False

In [None]:
# classify the example paper based on its GRID ids and show the label
classification = classify(gridIDs)
print(classification)

## The complete pipeline from DOI to Dutch/Non-Dutch

In [None]:
def DOIBoost_pipeline(collection, DOIs):
    """
    The complete pipeline for identifying Dutch research using DOIBoost

    A DOI only ends up in the result if it could be classified: DOIs that
    are not in the collection, DOIs without a usable list of GRID ids and
    DOIs for which the classifier returns None are all left out.

    @param  Collection    The DOIBoost collection in MongoDB
    @param  array         The list of DOIs that need to be identified
    @return dict          A dictionary with {DOI: classification} pairs
    """

    # resulting dict
    result = {}

    # loop over the DOIs
    for DOI in DOIs:

        # retrieve the metadata
        metadata = metadata_retriever(collection, DOI)

        # skip DOIs that are not in the collection
        if metadata is None:
            continue

        # retrieve the GRID ids
        gridIDs = grid_ids_retriever(metadata)

        # skip if we don't have any GRID ids and can't be sure about the DOI identification
        if not gridIDs:
            continue

        # classify the DOI based on the GRID ids
        classification = classify(gridIDs)

        # only add the classification if it's True or False
        if classification is None:
            continue

        # add the classification to the resulting dict
        result[DOI] = classification

    # return the result
    return result

In [None]:
# run the complete pipeline for a single example DOI and show the {DOI: classification} pairs
classifications = DOIBoost_pipeline(metacollection, ['10.1007/s00422-017-0730-1'])
print(classifications)

## The evaluation 

In [None]:
def evaluation(classifications, golden):
    """
    Compare the classifier labels with the golden standard and count the outcomes.

    @param  dict   The labels from the classifier, as {DOI: label} pairs
    @param  dict   The golden standard labels, as {DOI: label} pairs
    @return dict   The counts per outcome: 'TP', 'FP', 'TN', 'FN' and 'Unknown'
                   (a defaultdict, outcomes that never occurred have no entry)
    """

    # the counters, every outcome starts at zero
    tally = defaultdict(int)

    # walk over every classified DOI together with its label
    for DOI, predicted in classifications.items():

        # the golden label, None if the golden standard doesn't know the DOI
        expected = golden.get(DOI)

        # no golden label to compare with
        if expected is None:
            outcome = 'Unknown'

        # golden label is Positive: either a hit or a miss
        elif expected == True:
            outcome = 'TP' if predicted == True else 'FN'

        # golden label is Negative: either a correct rejection or a false alarm
        else:
            outcome = 'TN' if predicted == False else 'FP'

        # count the outcome of this DOI
        tally[outcome] += 1

    # hand back the counters
    return tally

In [None]:
# sanity check: the classifier output is used as its own golden standard,
# so every DOI can only be counted as a true positive or a true negative
evaluations = evaluation(classifications, classifications)
print(evaluations)