# Import block

In [1]:
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from collections import defaultdict
from habanero import Crossref, counts, cn
import datetime

# The DOIBoost pipeline (for the golden standard)

In [2]:
class DOIBoostPipeline():
    """
    Labels DOIs as Dutch/Non-Dutch using the GRID ids that are stored in a
    DOIBoost collection (MongoDB) and the GRID index (Elasticsearch).
    The resulting labels are used as the golden standard.
    """

    def __init__(self, dataset = 'doiboost2018'):
        """
        Set up the MongoDB collection and the Elasticsearch connection.

        @param  string   The name of the collection in the 'narcis' database
        """

        # get the mongo client
        client = MongoClient('mongo')

        # get the database
        metadatadb = client.get_database('narcis')

        # get the metadata collection
        self.metacollection = metadatadb[dataset]

        # get the elasticsearch connection
        es_host = "elasticsnarcis"
        self.es = Elasticsearch([es_host])

        # set the elasticsearch index and doctype
        self.searchindex = 'grid'
        self.doctype = 'metadata'

    def metadata_retriever(self, DOI):
        """
        Retrieves metadata when an DOI is provided.

        @param  string       The DOI
        @return dict         The corresponding metadata, None if the DOI is unknown
        """

        # find the metadata with the DOI
        subset = self.metacollection.find({'doi': DOI}).limit(1)

        # return the first item, or None when there was no hit for the DOI
        return next(iter(subset), None)

    def grid_ids_retriever(self, metadata):
        """
        Uses metadata to retrieve the GRID ids

        @param  dict    The metadata of a paper (None is accepted)
        @return tuple   (list of GRID ids, golden flag). The golden flag is
                        False as soon as one affiliation has no GRID id, or
                        when there is no metadata at all.
        """

        # initial list
        gridIDs = []

        # without metadata there is nothing to extract and nothing is certain
        if not metadata:
            return gridIDs, False

        golden = True

        # loop over the authors in the list
        for author in metadata.get('authors') or []:

            # loop over the affiliation(s) of the author, if there are any
            # NOTE(review): an author without any affiliation does not clear
            # the golden flag; only an affiliation without a GRID id does.
            for affiliation in author.get('affiliations') or []:

                # get the identifiers
                identifiers = affiliation.get('identifiers') or []

                # if one affiliation doesn't have a GRID id, we can't be sure about the result
                if len(identifiers) < 2:
                    golden = False
                    continue

                # get the value of the second item, which is always the GRID id
                gridIDs.append(identifiers[1]['value'])

        # return the list of GRID ids and the golden flag
        return gridIDs, golden

    def classify(self, gridIDs, golden):
        """
        Classify a research as Dutch/Non-Dutch based on the grid ids

        @param  array    The grid IDs of the authors
        @param  boolean  True if the list of grid IDs is known to be complete
        @return boolean  True for Dutch, False for Non-Dutch, None if unsure
        """

        # without GRID ids there is nothing to classify
        if not gridIDs:
            return None

        # Get GRID data
        for ID in gridIDs:
            res = self.es.search(index=self.searchindex, doc_type=self.doctype,
                                 body={"query": {"match": {'ID': "%s" % ID }}})
            hits = res['hits']['hits']

            # an id that is not in the GRID index tells us nothing about the
            # country, so a negative answer can no longer be trusted
            if not hits:
                golden = False
                continue

            # Verify Dutch affiliation: one Dutch affiliation is enough
            if hits[0]['_source']['Country'] == 'Netherlands':
                return True

        # no Dutch affiliation was found, but the list of GRID ids is
        # incomplete, so we are not sure that the research is Non-Dutch
        if not golden:
            return None

        return False

    def pipeline(self, DOIs):
        """
        The complete pipeline for identifying Dutch research using DOIBoost

        @param  array         The list of DOIs that need to be identified
        @return dict          A dictionary with {DOI: classification} pairs;
                              DOIs that can't be classified with certainty are left out
        """

        # resulting dict
        result = {}

        # loop over the DOIs
        for DOI in DOIs:

            # retrieve the metadata, skip DOIs that are not in the collection
            metadata = self.metadata_retriever(DOI)
            if metadata is None:
                continue

            # retrieve the GRID ids
            gridIDs, golden = self.grid_ids_retriever(metadata)

            # skip if we don't have any GRID ids and can't be sure about the DOI identification
            if not gridIDs:
                continue

            # classify the DOI based on the GRID ids
            classification = self.classify(gridIDs, golden)

            # only add the classification if it's True or False
            if classification is None:
                continue

            # add the classification to the resulting dict
            result[DOI] = classification

        # return the result
        return result

    def get_dois(self, size = 10000):
        """
        Get a list of DOIs for testing

        @param  int     The maximum size of the list with DOIs
        @return array   The list of DOIs
        """

        # get the iterable collection of research
        subset = self.metacollection.find({}).limit(size)

        # collect the DOI of every research in the collection
        return [item['doi'] for item in subset]

# The CrossRef pipeline

In [3]:
class CrossRefPipeline():
    """
    Labels DOIs as Dutch/Non-Dutch by looking up the author names from the
    CrossRef metadata in the 'authors' index (Elasticsearch).
    """

    def __init__(self):
        """
        Set up the CrossRef client and the Elasticsearch connection.
        """

        # get the CrossRef API
        self.cr = Crossref()

        # get the elasticsearch connection
        es_host = "elasticsnarcis"
        self.es = Elasticsearch([es_host])

        # set the elasticsearch index and doctype
        self.searchindex = 'authors'
        self.doctype = 'metadata'

    def metadata_retriever(self, DOI):
        """
        Retrieves CrossRef metadata when an DOI is provided.

        @param  string       The DOI
        @return dict         The corresponding CrossRef metadata
        """

        # find the metadata with the DOI
        research = self.cr.works(ids = DOI)

        # return the metadata
        return research['message']

    def author_retriever(self, metadata):
        """
        Retrieves the author names from the metadata and convert it to a string

        @param  dict    The CrossRef metadata
        @return array   The list of author names ("given family" or just
                        "family"), None if the metadata contains no authors
        """

        # get the authors
        authors = metadata.get('author', None)

        # stop if the metadata doesn't contain an author
        if not authors:
            return None

        # the resulting list
        result = []

        # loop over the authors
        for author in authors:

            # retrieve the first name
            name = author.get('given', None)

            # retrieve the surname
            surname = author.get('family', None)

            # don't add the name if the surname is absent
            if not surname:
                continue

            # add the first name before the surname if there is one
            result.append(name + " " + surname if name else surname)

        # return the list of author names
        return result

    def retrieve_countries(self, authors, metadata):
        """
        Get the country of the authors

        @param  array   A list of author names
        @param  dict    The CrossRef metadata (currently unused, kept for the callers)
        @return array   The list of countries, one for every author that was found
        """

        # resulting list
        result = []

        # loop over the authors
        for author in authors:

            # try to find the name in the author dataset
            try:
                res = self.es.search(index=self.searchindex, doc_type=self.doctype,
                                     body={"query": {"match": {'name': "%s" % author}}})
                hits = res['hits']['hits']
            except Exception:
                # the lookup is best effort: a failed search for one author
                # should not stop the lookup of the other authors
                continue

            # the author is not in the dataset
            if not hits:
                continue

            # use the first hit, whether there is one hit or there are multiple hits
            result.append(hits[0]['_source']['country'])

        # return the list of countries
        return result

    def classify(self, countries):
        """
        Classify the DOI as Dutch/Non Dutch

        @param  array     The countries of the affiliations the authors work for
        @return boolean   True for Dutch, False for Non Dutch
        """

        # one of the countries has to be 'Netherlands' for the DOI to be classified as Dutch
        return 'Netherlands' in countries

    def pipeline(self, DOIs):
        """
        The complete pipeline for identifying Dutch research using CrossRef

        Errors raised by the CrossRef client (e.g. a failing connection) are
        not caught here and stop the pipeline.

        @param  array         The list of DOIs that need to be identified
        @return dict          A dictionary with {DOI: classification} pairs;
                              DOIs without authors are left out
        """

        # resulting dict
        result = {}

        # loop over the DOIs
        for DOI in DOIs:

            # retrieve the metadata
            metadata = self.metadata_retriever(DOI)

            # retrieve the authors from the metadata
            authors = self.author_retriever(metadata)

            # we need to have authors to continue
            if not authors:
                continue

            # retrieve the countries of the authors
            countries = self.retrieve_countries(authors, metadata)

            # classify the DOI based on the countries and add it to the result
            result[DOI] = self.classify(countries)

        # return the result
        return result

# The evaluation function

In [4]:
def evaluation(classifications, golden):
    """
    Compare the labels of a classifier with the golden standard labels

    Every DOI in the classifier output is counted once: as 'Unknown' when the
    golden standard has no label for it, and otherwise as 'TP', 'FN', 'TN' or
    'FP'.

    @param  dict   The labels from the classifier ({DOI: label})
    @param  dict   The golden standard labels ({DOI: label})
    @return dict   The number of DOIs per outcome
    """

    # counters start at zero, outcomes that never occur are not in the dict
    tally = defaultdict(int)

    for doi, predicted in classifications.items():

        # look up the golden label for this DOI
        expected = golden.get(doi, None)

        if expected is None:
            # there is no golden label to compare with
            outcome = 'Unknown'
        elif expected == True:
            # golden label is Positive: anything but a Positive prediction is a miss
            outcome = 'TP' if predicted == True else 'FN'
        else:
            # golden label is Negative: anything but a Negative prediction is a false alarm
            outcome = 'TN' if predicted == False else 'FP'

        tally[outcome] += 1

    return tally

# Let's see the model in action

In [5]:
# set up both pipelines: DOIBoost provides the golden labels,
# CrossRef is the classifier that gets evaluated against them
D = DOIBoostPipeline('doiboost2018')
C = CrossRefPipeline()

In [20]:
# fetch up to 100 DOIs from the DOIBoost collection
DOIs = D.get_dois(100)

In [21]:
# build the golden labels; DOIs that can't be labelled with certainty are left out
doiboost_labels = D.pipeline(DOIs)

In [22]:
# classify only the DOIs that have a golden label (this queries the CrossRef API)
crossref_labels = C.pipeline(doiboost_labels.keys())

ConnectionError: HTTPSConnectionPool(host='api.crossref.org', port=443): Max retries exceeded with url: /works/10.1016/j.bbrc.2018.02.001 (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f7167b11588>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))

In [None]:
# compare the CrossRef labels with the golden labels from DOIBoost
# NOTE: the variable name is misspelled ("evalutation"); left unchanged on purpose
evalutation_results = evaluation(crossref_labels, doiboost_labels)
print(evalutation_results)

In [None]:
# sanity check: the golden labels compared with themselves can only give TP and TN
evalutation_results = evaluation(doiboost_labels, doiboost_labels)
print(evalutation_results)