## Import the needed libraries

In [1]:
# import re
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from collections import defaultdict
# from habanero import Crossref, counts, cn
# import pandas as pd
# import matplotlib.pyplot as plt
# from matplotlib.ticker import FuncFormatter

## Get the mongo collection

In [2]:
# connect to the MongoDB server that is addressed as 'mongo'
client = MongoClient('mongo')

# open the 'narcis' database and pick the 'doiboost2017' metadata collection from it
metadatadb = client.get_database('narcis')
metacollection = metadatadb['doiboost2017']

## The metadata retriever

In [3]:
def metadata_retriever(collection, DOI):
    """
    Look up the metadata record that belongs to a single DOI.

    @param  Collection   The metadata collection
    @param  string       The DOI
    @return dict         The first matching record, or None when nothing matches
    """

    # ask for at most one document whose 'doi' field equals the given DOI
    matches = collection.find({'doi': DOI}).limit(1)

    # hand back the first hit; an empty result yields None
    return next(iter(matches), None)
    

## Get the metadata that belongs to the DOI

In [4]:
# fetch the record of one example DOI through the collection handle created earlier
metadata = metadata_retriever(metacollection, '10.1007/s00422-017-0730-1')
print(metadata)

{'_id': ObjectId('5d01137ba1ae159641df3c08'), 'publisher': None, 'issn': [{'type': 'print', 'value': '0340-1200'}, {'type': 'electronic', 'value': '1432-0770'}], 'doi': '10.1007/s00422-017-0730-1', 'license': [{'url': 'http://creativecommons.org/licenses/by/4.0', 'content-version': 'unspecified', '"delay-in-days': None, 'date-time': '2017-09-14T00:00:00Z'}], 'published-print': '2017-12-1', 'title': ['Affective–associative two-process theory: a neurocomputational account of partial reinforcement extinction effects'], 'issued': '2017-9-14', 'abstract': [{'provenance': 'MAG', 'value': 'The partial reinforcement extinction effect (PREE) is an experimentally established phenomenon: behavioural response to a given stimulus is more persistent when previously inconsistently rewarded than when consistently rewarded. This phenomenon is, however, controversial in animal/human learning theory. Contradictory findings exist regarding when the PREE occurs. One body of research has found a within-subj

## The GRID id retriever

In [5]:
def grid_ids_retriever(metadata):
    """
    Uses metadata to retrieve the GRID ids

    Missing information never raises: a record that is None or empty, an
    author without affiliations, or an affiliation with fewer than two
    identifiers simply contributes no GRID id.

    @param  dict    The metadata of a paper, or None when no record was found
    @return array   The list of GRID ids, one per usable affiliation
                    (duplicates are kept)
    """

    # initial list
    gridIDs = []

    # no record at all (e.g. the DOI was not found): nothing to extract
    if not metadata:
        return gridIDs

    # loop over the authors; a missing or empty 'authors' entry means no authors
    for author in metadata.get('authors') or []:

        # loop over the affiliation(s) of the author, if there are any
        for affiliation in author.get('affiliations') or []:

            # get the identifiers
            identifiers = affiliation.get('identifiers') or []

            # this code relies on the GRID id being the second identifier;
            # affiliations that do not have a second identifier are skipped
            if len(identifiers) > 1:
                gridIDs.append(identifiers[1]['value'])

    # return the list of GRID ids
    return gridIDs

## Get the GRID ids that belong to the metadata

In [6]:
# extract the GRID ids from the metadata record retrieved above and show them
gridIDs = grid_ids_retriever(metadata)
print(gridIDs)

['grid.8761.8', 'grid.8761.8', 'grid.412798.1', 'grid.5801.c', 'grid.4514.4']


## Get the GRID index of Elasticsearch

In [7]:
# settings for the Elasticsearch lookups: host, index name and document type
es_host = "elasticsnarcis"
searchindex = 'grid'
doctype = 'metadata'

# a single client object, bound to both `es_local` and `es`
es = es_local = Elasticsearch([es_host])

## The classifier

In [8]:
def classify(gridIDs, client=None, index=None, doc_type=None):
    """
    Classify a research as Dutch/Non-Dutch based on the grid ids

    A research counts as Dutch as soon as one of its GRID ids resolves to an
    entry whose 'Country' is 'Netherlands'. GRID ids that have no match in the
    index are skipped instead of raising an error.

    @param  array          The grid IDs of the authors
    @param  Elasticsearch  Optional client to query, defaults to the global `es`
    @param  string         Optional index name, defaults to the global `searchindex`
    @param  string         Optional document type, defaults to the global `doctype`
    @return boolean        True for Dutch, False for Non-Dutch
    """

    # fall back to the notebook-level settings for everything not passed in
    if client is None:
        client = es
    if index is None:
        index = searchindex
    if doc_type is None:
        doc_type = doctype

    # Get GRID data
    for ID in gridIDs:
        res = client.search(index=index, doc_type=doc_type,
                            body={"query": {"match": {'ID': "%s" % ID}}})
        hits = res['hits']['hits']

        # unknown GRID id: there is nothing to inspect, try the next one
        if not hits:
            continue

        # only the first hit is inspected
        country = hits[0]['_source'].get('Country')

        # Verify Dutch affiliation
        if country == 'Netherlands':
            return True

    return False

In [9]:
# classify the example paper based on the GRID ids extracted above and show the verdict
classification = classify(gridIDs)
print(classification)

False


## The complete pipeline from DOI to Dutch/Non-Dutch

In [10]:
def DOIBoost_pipeline(collection, DOIs):
    """
    The complete pipeline for identifying Dutch research using DOIBoost

    DOIs that cannot be judged are left out of the result: those without a
    record in the collection and those whose record yields no GRID ids.

    @param  Collection    The DOIBoost collection in MongoDB
    @param  array         The list of DOIs that need to be identified
    @return dict          A dictionary with {DOI: classification} pairs
    """

    # resulting dict
    result = {}

    # loop over the DOIs
    for DOI in DOIs:

        # retrieve the metadata
        metadata = metadata_retriever(collection, DOI)

        # skip DOIs that are not present in the collection, one unknown DOI
        # should not abort the run for all the others
        if metadata is None:
            continue

        # retrieve the GRID ids
        gridIDs = grid_ids_retriever(metadata)

        # skip if we don't have any GRID ids and can't be sure about the DOI identification
        if gridIDs:

            # classify the DOI based on the GRID ids and store the outcome
            result[DOI] = classify(gridIDs)

    # return the result
    return result

In [11]:
# run the whole pipeline for a list that holds only the example DOI and show the result
classifications = DOIBoost_pipeline(metacollection, ['10.1007/s00422-017-0730-1'])
print(classifications)

{'10.1007/s00422-017-0730-1': False}


## The evaluation 

In [12]:
def evaluation(classifications, golden):
    """
    Count how often the classifier agrees and disagrees with the golden standard

    @param  dict   The labels from the classifier
    @param  dict   The golden standard labels
    @return dict   The counts per outcome, stored under the keys 'TP', 'FP',
                   'TN' and 'FN' (only outcomes that occurred are present)
    """

    tally = defaultdict(int)

    for doi, predicted in classifications.items():
        expected = golden[doi]

        # the explicit comparisons with True/False are kept on purpose, so that
        # labels that are not booleans are counted exactly as before
        if expected == True:
            outcome = 'TP' if predicted == True else 'FN'
        else:
            outcome = 'TN' if predicted == False else 'FP'

        tally[outcome] += 1

    return tally

In [13]:
# sanity check: the classifier output is compared with itself, so with boolean
# labels this can only produce true positives and true negatives
evaluations = evaluation(classifications, classifications)
print(evaluations)

defaultdict(<class 'int'>, {'TN': 1})
