In [1]:
from rc_util import * 
from richcontext import scholapi as rc_scholapi # pip install richcontext.scholapi

import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from rcgraph.richcontext import graph as rc_graph

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# build a graph from the JSON-LD corpus
net = RCNetwork()

# parses, builds NetworkX graph, and creates default "rank" for each entity
net.load_network("full.jsonld") # net.load_network("../../rclc/corpus.jsonld")

25123.997926712036

In [7]:
# profile corpus
num_datasets = len(set(net.data.keys()))

num_publications = len(set(net.publ.keys()))

num_providers = len(set(net.prov.keys()))

num_authors = len(set(net.auth.keys()))

print("this corpus has: \n--{:,.0f} datasets \n--{:,.0f} publications \n--{:,.0f} providers \n--{:,.0f} authors".format(
num_datasets, num_publications, num_providers, num_authors))

this corpus has: 
--619 datasets 
--3,816 publications 
--378 providers 
--8,249 authors


In [8]:
## dataset RC-IDs for 4 target datasets: FoodAPS, ARMS, IRI InfoScan & Consumer Network
data_list = ['dataset-955eb4bf66b73016354c', # ARMS
             'dataset-fc71e81f1f2c4130d897', # FoodAPS
            'dataset-ae01c2bf3451493f3620',  # IRI Consumer Network
             'dataset-cb23c2370049f4960a3a'] # IRI InfoScan

In [5]:
# simple count of publications for each dataset:
for rc_id in data_list:
    # gather  info from 'recommender'; it sorts the publications based on Eigenvector calc of 'rank'
    uuid, title, rank, url, provider, publ_list = net.reco_data(net.data[rc_id])
    print('{} | {} |  has {} publications'.format(rc_id, title,len(publ_list)))

dataset-955eb4bf66b73016354c | Agricultural Resource Management Survey |  has 320 publications
dataset-fc71e81f1f2c4130d897 | FoodAPS National Household Food Acquisition and Purchase Survey |  has 35 publications
dataset-ae01c2bf3451493f3620 | IRI Consumer Network |  has 27 publications
dataset-cb23c2370049f4960a3a | IRI Infoscan |  has 88 publications


1. dataset name
2. number of publications
3. number of combined datasets
4. top 5 datasets and their providers
5. number of authors
6. top 5 authors

### test steps for one dataset

In [6]:
# build on existing function to generate metrics
uuid, title, rank, url, provider, publ_list = net.reco_data(net.data['dataset-ae01c2bf3451493f3620'])

In [3]:
def extract_institution(orcid,institutions_dict, institutions_count):
    hit = False

    # get orcid institution
    if len(net.auth[a].view["orcid"]) > 19:
        orcid = net.auth[a].view["orcid"][-19:]

    result = schol.orcid.affiliations(orcid)
    result2 = schol.orcid.funding(orcid)
    if result.meta:
        hit = True
        if isinstance(result.meta, list):
            institution = result.meta[0]['employment:organization']
        else:
            institution = result.meta['employment:organization']
        #print(orcid, institution['common:name'])

        if institution['common:name'] in institutions_dict.keys():
            institutions_count[institution['common:name']] += 1
        else:
            institutions_dict[institution['common:name']] = institution  # TODO potentially show countries?
            institutions_count[institution['common:name']] = 1
    if result2.meta:
        #print("TODO, look at schol.orcid.funding(orcid) for", orcid)
        funding.append(result2.meta)
    return hit

In [4]:
# load all KG known authors
graph = rc_graph.RCGraph()
graph.authors.load_entities("../rcgraph/authors.json") 

# load all Dimensions raw metadata from publications
dimension_pubs = {}
for partition, pub_iter in graph.iter_publications("../rcgraph/bucket_stage"):
    for pub in pub_iter:
        if "Dimensions" in pub:
            if "authors" in pub["Dimensions"]:
                dimension_pubs[pub["title"]] = pub["Dimensions"]["authors"]

In [5]:

def extract_institution_from_dimensions(pub,author_uuid):
    if author_uuid in graph.authors.known.uuid_map:
        if "dimensions" in graph.authors.known.uuid_map[author_uuid]:
            researcher_id = graph.authors.known.uuid_map[author_uuid]["dimensions"]
            if pub in dimension_pubs:
                for dimensions_meta in dimension_pubs[pub]:
                    if dimensions_meta["researcher_id"] == researcher_id:
                        try:
                            institution_name = dimensions_meta["affiliations"][0]["name"]

                            if institution_name in institutions_dict.keys():
                                institutions_count[institution_name] += 1
                            else:
                                institutions_dict[institution_name] = dimensions_meta["affiliations"][0]  # TODO potentially show countries?
                                institutions_count[institution_name] = 1
                        except:
                            pass
                        return True

    return False


In [11]:
funding = []


print(net.data['dataset-ae01c2bf3451493f3620'].view['title'])
print(len(publ_list))
joined_datasets = {}
joined_data_counts = {}
authors_dict = {}
authors_count = {}
institutions_dict = {}
institutions_count = {}
authorsWithInstitution_count = 0
schol = rc_scholapi.ScholInfraAPI(config_file="rc.cfg", logger=None)

for pubid, pub, pubrank in publ_list:
    d_list = net.publ[net.id_list[pubid]].view['datasets'].copy()
    d_list.remove(uuid) # don't include current dataset
    for d in d_list:
        if d in joined_datasets.keys():
            joined_data_counts[d] += 1
        else:
            joined_datasets[d] = net.data[d].view
            joined_data_counts[d] = 1
            joined_datasets[d]['ProvName'] = net.prov[joined_datasets[d]['provider']].view['title']
    a_list = net.publ[net.id_list[pubid]].view['authors'].copy()
    for a in a_list:
        if a in authors_dict.keys():
            authors_count[a] += 1
        else:
            authors_dict[a] = net.auth[a].view
            authors_count[a] = 1
            
            # for each dataset, count the institution only once per author. If the author wrote several publications, it will account for only one institution count.
            if net.auth[a].view["orcid"]:
                hit = extract_institution(net.auth[a].view["orcid"], institutions_dict, institutions_count)
                if hit:
                    authorsWithInstitution_count += 1

IRI Consumer Network
88


In [None]:
print(joined_datasets.keys())

In [None]:
joined_datasets['dataset-17fbd0c3d561e8260ab3']

In [None]:
# to get top 5 authors
sorted(authors_count.items(), key=lambda t: t[1], reverse=-True)[:5]

In [None]:
sorted(joined_data_counts.items(), key=lambda t: t[1], reverse=-True)[:5]

In [None]:
for data_id, data_count in sorted(joined_data_counts.items(), key=lambda t: t[1], reverse=-True)[:5]:
    DataName = joined_datasets[data_id]['title']
    DataProv = net.prov[joined_datasets[data_id]['provider']].view['title']
    print('Dataset {} by {} joined {} times'.format(DataName,DataProv,data_count))

In [None]:
for auth_id, auth_count in sorted(authors_count.items(), key=lambda t: t[1], reverse=-True)[:5]:
    AuthName = authors_dict[auth_id]['title']
    AuthORCID = authors_dict[auth_id]['orcid']
    print('{} | {} | used dataset {} times'.format(AuthName, AuthORCID, auth_count))

### work through specified list of datasets
metrics to generate for each:
- number of publications
- number of combined datasets
- top 5 datasets and their providers
- number of authors
- top 5 authors

In [9]:
funding = []
schol = rc_scholapi.ScholInfraAPI(config_file="rc.cfg", logger=None) # to access ORCID

dataset_metrics = {} # object to hold resulting metrics calculated below

for this_data in data_list:
    # build on existing function to generate metrics
    uuid, title, rank, url, provider, publ_list = net.reco_data(net.data[this_data])
    print('collecting measures for {}'.format(title))
    num_pubs = len(publ_list)
    print('-- used in {} publications'.format(num_pubs))
    joined_datasets = {}
    joined_data_counts = {}
    authors_dict = {}
    authors_count = {}
    institutions_dict = {}
    institutions_count = {}
    authorsWithInstitution_count = 0

    for pubid, pub, pubrank in publ_list:
        # get datasets used in publication
        d_list = net.publ[net.id_list[pubid]].view['datasets'].copy()
        d_list.remove(uuid) # don't include current dataset
        for d in d_list:
            if d in joined_datasets.keys():
                joined_data_counts[d] += 1
            else:
                joined_datasets[d] = net.data[d].view
                joined_data_counts[d] = 1
                joined_datasets[d]['ProvName'] = net.prov[joined_datasets[d]['provider']].view['title']
        a_list = net.publ[net.id_list[pubid]].view['authors'].copy()
        for a in a_list:
            if a in authors_dict.keys():
                authors_count[a] += 1
            else:
                authors_dict[a] = net.auth[a].view
                authors_count[a] = 1
                
                # extract institution first trying with author's ORCID (new pull) and then with Dimensions (existing metadata in RCGraph)
                hit = False
                if net.auth[a].view["orcid"]:
                    hit = extract_institution(net.auth[a].view["orcid"], institutions_dict, institutions_count)
                else:
                    hit = extract_institution_from_dimensions(pub,a)
                
                if hit:
                    authorsWithInstitution_count += 1

    # add results to dataset_metrics dict:
    dataset_metrics[uuid] = {'combinedDatasets': joined_datasets, 'combinedDataCounts': joined_data_counts,
                            'PubAuthors': authors_dict, 'AuthorCounts': authors_count,
                            'TotalPublications': num_pubs, 'PubIDs': publ_list,
                             'AuthorInstitutions':institutions_dict, 'InstitutionCounts':institutions_count,
                             'AuthorsWithInstitutionCounts':authorsWithInstitution_count}
    
    ## print selected gathered info:
    # total datasets
    print('-- {} datasets were combined with this dataset. The top 5 are:'.format(len(joined_datasets.keys())))
    # top 5 datasets
    for data_id, data_count in sorted(joined_data_counts.items(), key=lambda t: t[1], reverse=-True)[:5]:
        DataName = joined_datasets[data_id]['title']
        DataProv = joined_datasets[data_id]['ProvName']
        print('---- {} by {} joined {} times'.format(DataName,DataProv,data_count))
    # total authors:
    print('-- {} authors used this dataset. The top 5 are:'.format(len(authors_dict.keys())))
    # top 5 authors:
    for auth_id, auth_count in sorted(authors_count.items(), key=lambda t: t[1], reverse=-True)[:5]:
        AuthName = authors_dict[auth_id]['title']
        AuthORCID = authors_dict[auth_id]['orcid']
        if AuthORCID=='':
            AuthORCID = 'unknown'
        print('---- {} | ORCID = {} | used dataset {} times'.format(AuthName, AuthORCID, auth_count))
    # total institutions:
    print('-- {} institutions used this dataset. The top 5 are:'.format(len(institutions_dict.keys())))
    # top 5 institutions:
    for institution_name , institution_count in sorted(institutions_count.items(), key=lambda t: t[1], reverse=-True)[:5]:
        InstitutionName = institution_name

        print('---- {} | used dataset {} times'.format(InstitutionName, institution_count))

    print('------ Note: of the {} authors, we were able to find institutions for {} of them'.format(
        len(authors_dict.keys()), authorsWithInstitution_count))
    print('') # add line between each datast summary


print("TODO: review funding found for ",len(funding),"authors")

collecting measures for Agricultural Resource Management Survey
-- used in 320 publications
-- 72 datasets were combined with this dataset. The top 5 are:
---- Census of Agriculture by US Department of Agriculture joined 25 times
---- Soil Survey Geographic Database by US Department of Agriculture joined 5 times
---- Cropland Data Layer by US Department of Agriculture joined 5 times
---- Survey of Consumer Finance by Federal Reserve System joined 5 times
---- USDA Fertilizer Use and Price by US Department of Agriculture joined 4 times
-- 545 authors used this dataset. The top 5 are:
---- Mishra, Ashok K. | ORCID = https://orcid.org/0000-0002-0988-1428 | used dataset 44 times
---- Key, Nigel D. | ORCID = https://orcid.org/0000-0002-0290-8608 | used dataset 15 times
---- Gillespie, Jeffrey M. | ORCID = unknown | used dataset 15 times
---- El-Osta, Hisham Said | ORCID = unknown | used dataset 15 times
---- Nehring, Richard F. | ORCID = unknown | used dataset 14 times
-- 118 institutions u

In [10]:
dataset_metrics.keys()

dict_keys(['dataset-955eb4bf66b73016354c', 'dataset-fc71e81f1f2c4130d897', 'dataset-ae01c2bf3451493f3620', 'dataset-cb23c2370049f4960a3a'])

In [12]:
# can access all results if need to report out differently, eg number of datasets:
len(dataset_metrics['dataset-955eb4bf66b73016354c']['combinedDatasets'].keys())

72