In [13]:
# import re
import json
from pymongo import MongoClient
from elasticsearch import Elasticsearch
from collections import defaultdict
import os
# from habanero import Crossref, counts, cn
# import pandas as pd
# import matplotlib.pyplot as plt
# from matplotlib.ticker import FuncFormatter

In [2]:
# connect to the MongoDB server reachable under the hostname 'mongo'
client = MongoClient('mongo')

# select the 'narcis' database on that server
metadatadb = client.get_database('narcis')

# pick the 'doiboost2017' collection, which is used as the metadata collection
metacollection = metadatadb['doiboost2017']

In [3]:
# Elasticsearch settings: host to connect to, index to query and document type
es_host = "elasticsnarcis"
searchindex = 'grid'
doctype = 'metadata'

# one client instance, available under both names used in this notebook
es = es_local = Elasticsearch([es_host])

In [27]:
def dict_to_json(data, filename):
    """
    Append `data` to `filename` as a single line of JSON.

    @param  data      JSON-serialisable object (typically a dict)
    @param  filename  Path of the file to append to; created when missing
    """

    # open in append mode so earlier lines are kept
    with open(filename, 'a') as out:

        # serialise the object and terminate the record with a newline
        line = "{}\n".format(json.dumps(data))
        out.write(line)

In [29]:
def create_author_country_data(metacollection, filename, es_client=None,
                               index=None, doc_type=None):
    """
    Create a JSON-lines dataset linking authors to the country of their affiliation.

    For every author affiliation that carries at least two identifiers, the
    value of the second identifier (assumed to be the GRID id) is matched
    against the `ID` field in Elasticsearch. The `Country` of the first hit is
    written to `filename` together with the author's full name and the
    paper's `issued` value. Existing content of `filename` is discarded.

    Affiliations whose identifier has no match in the index are skipped
    instead of aborting the whole export.

    @param  metacollection  Object whose find() yields the paper metadata dicts
    @param  filename        Path of the output file (one JSON object per line)
    @param  es_client       Elasticsearch client; defaults to the notebook-level `es`
    @param  index           Index to search; defaults to the notebook-level `searchindex`
    @param  doc_type        Document type to search; defaults to the notebook-level `doctype`
    @return None
    """

    # fall back to the notebook-level Elasticsearch configuration
    if es_client is None:
        es_client = es
    if index is None:
        index = searchindex
    if doc_type is None:
        doc_type = doctype

    # marks identifiers that produced no hit, so they are not queried again
    not_found = object()

    # query value -> country (or not_found); the same institution occurs for
    # many authors, so each identifier is looked up only once
    countries = {}

    # "w" creates the file or empties an existing one; it stays open for the
    # whole export instead of being reopened for every record
    with open(filename, "w") as fp:

        # loop over the metadata entries of the complete collection
        for metadata in metacollection.find():

            # get the date of the paper
            date = metadata['issued']

            # loop over the authors of the paper
            for author in metadata['authors']:

                # get the full name
                name = author['fullname']

                # the affiliation list may be empty or missing a value (None)
                for affiliation in author['affiliations'] or ():

                    # get the identifiers
                    identifiers = affiliation['identifiers']

                    # the affiliation needs to have a second identifier
                    if len(identifiers) < 2:
                        continue

                    # value of the second item, assumed to be the GRID id
                    grid_id = "%s" % identifiers[1]['value']

                    # retrieve the country of the affiliation from the GRID data
                    if grid_id not in countries:
                        res = es_client.search(index=index, doc_type=doc_type,
                                               body={"query": {"match": {'ID': grid_id}}})
                        hits = res['hits']['hits']
                        countries[grid_id] = hits[0]['_source']['Country'] if hits else not_found

                    country = countries[grid_id]

                    # no matching GRID record: nothing to report for this affiliation
                    if country is not_found:
                        continue

                    # add the country and date to the entry of the author
                    record = {'name': name, 'country': country, 'date': date}
                    fp.write(json.dumps(record) + "\n")

In [33]:
# export the author/country records of the whole collection to authors.json
create_author_country_data(metacollection=metacollection, filename="authors.json")

In [28]:
# smoke test for dict_to_json: append ten small records to test.json, one per line
for i in range(0, 10):
    dict_to_json(data={"test": i}, filename="test.json")