## Export manually curated linkages from the USDA Excel spreadsheet sent on Sept 13, 2019

#### Read in manually curated linkages and map to dataset id

Import curated linkages with dataset_ids

In [1]:
import pandas as pd

In [2]:
# Machine-specific absolute path to the curated USDA linkage CSV; adjust it when running elsewhere.
ers_linkages_path =  '/Users/sophierand/RichContextMetadata/metadata/20190913_usda_excel/producing_metadata/usda_linkages.csv'
# Load the curated linkages into a DataFrame.
usda_linkages = pd.read_csv(ers_linkages_path)

Limit to titles with dataset linkages

In [3]:
import numpy as np

In [4]:
# Keep only the columns needed downstream and collapse repeated rows.
linkage_columns = ['title', 'pub_url', 'dataset_id']
usda_linkages = usda_linkages[linkage_columns].drop_duplicates()

# A usable linkage has a non-null dataset_id that is not the "0" placeholder.
has_dataset_id = usda_linkages['dataset_id'].notna() & (usda_linkages['dataset_id'] != "0")
usda_linkages_lim = usda_linkages.loc[has_dataset_id]

Import `datasets.json`

In [5]:
import json

In [6]:
# Machine-specific absolute path to the datasets.json file; adjust it when running elsewhere.
datasets_path = '/Users/sophierand/RCDatasets/datasets.json'

# JSON interchange files are UTF-8; pin the encoding instead of relying on
# the platform's locale default.
with open(datasets_path, encoding='utf-8') as json_file:
    datasets = json.load(json_file)

Map dataset_ids in linkage file to `datasets.json`

In [7]:
import unicodedata

In [8]:
def scrub_unicode (text):
    """
    try to handle the unicode edge cases encountered in source text,
    as best as possible

    text: a `str`, or UTF-8 encoded `bytes` (decoded first, undecodable
        bytes dropped)

    Newlines are collapsed to single spaces, typographic punctuation and
    literal backslash-u escape sequences are rewritten to plain equivalents,
    then the text is NFKD-normalized and every code point that still is not
    ASCII is discarded.

    returns: an ASCII-only `str`
    """
    # some content arrives as bytes rather than as a str
    if isinstance(text, bytes):
        text = text.decode("utf-8", "ignore")

    x = " ".join(line.strip() for line in text.split("\n")).strip()

    x = x.replace('“', '"').replace('”', '"')
    # the double-backtick opening quote has to be rewritten BEFORE single
    # backticks become apostrophes, otherwise this pattern can never match
    x = x.replace("`` ", '"')
    x = x.replace("‘", "'").replace("’", "'").replace("`", "'")
    x = x.replace("''", '"')
    x = x.replace('…', '...').replace("\\u2026", "...")
    x = x.replace("\\u00ae", "").replace("\\u2122", "")
    x = x.replace("\\u00a0", " ").replace("\\u2022", "*").replace("\\u00b7", "*")
    x = x.replace("\\u2018", "'").replace("\\u2019", "'").replace("\\u201a", "'")
    x = x.replace("\\u201c", '"').replace("\\u201d", '"')

    x = x.replace("\\u20ac", "€")
    x = x.replace("\\u2212", " - ") # minus sign

    x = x.replace("\\u00e9", "é")
    x = x.replace("\\u017c", "ż").replace("\\u015b", "ś").replace("\\u0142", "ł")
    x = x.replace("\\u0105", "ą").replace("\\u0119", "ę").replace("\\u017a", "ź").replace("\\u00f3", "ó")

    x = x.replace("\\u2014", " - ").replace('–', '-').replace('—', ' - ')
    x = x.replace("\\u2013", " - ").replace("\\u00ad", " - ")

    # decompose accented characters, then drop whatever is still non-ASCII;
    # the result of decode() is always a str
    return unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8")

In [9]:
# Index datasets.json by id once so each lookup is a dict access instead of a
# scan over every dataset. setdefault keeps the FIRST entry for a repeated id,
# which is what a first-match scan would return.
datasets_by_id = {}
for ds in datasets:
    datasets_by_id.setdefault(ds['id'], ds)


def lookup_dataset(dataset_id):
    """
    Return the {'dataset_id', 'dataset_name'} record for one dataset id.

    Raises KeyError naming the offending id when it is absent from
    datasets.json.
    """
    try:
        ds = datasets_by_id[dataset_id]
    except KeyError:
        raise KeyError('dataset_id {!r} not found in datasets.json'.format(dataset_id)) from None
    return {'dataset_id': ds['id'], 'dataset_name': ds['title']}


usda_linkage_list = []
for _, row in usda_linkages_lim.iterrows():
    # dataset_id holds one id or a comma-separated list of ids; strip
    # surrounding whitespace in both cases
    dataset_ids = [d.strip() for d in row['dataset_id'].split(",")]

    if len(dataset_ids) > 1:
        related = [lookup_dataset(d) for d in dataset_ids]
    else:
        # NOTE(review): a single id is exported as a bare dict rather than a
        # one-element list; kept as-is so the exported JSON shape is unchanged
        related = lookup_dataset(dataset_ids[0])

    usda_linkage_list.append({
        'title': scrub_unicode(row['title']),
        'url': row['pub_url'],
        'related_dataset': related,
    })
    

### Fetch publication information from Dimensions, where available

Import Dimensions helpers

In [10]:
import dimensions_search_api_client as dscli

Connect to API

In [11]:
def connect_ds_api(username,password):
    """
    Create a Dimensions search API client, apply this notebook's fixed
    limits through the client's `set_max_*` setters, and hand it the given
    credentials.

    username: Dimensions account name
    password: Dimensions account password

    returns: the configured `DimensionsSearchAPIClient`
    """
    client = dscli.DimensionsSearchAPIClient()

    # fixed limits used throughout this notebook
    client.set_max_in_items(100)
    client.set_max_return(1000)
    client.set_max_overall_returns(50000)

    # credentials
    client.set_username(username)
    client.set_password(password)

    return client

In [12]:
import configparser
# Parse Dimensions API settings from dimensions.cfg (path is relative to the working directory).
CONFIG = configparser.ConfigParser()
# read() returns the list of files it parsed, which is what this cell displays.
CONFIG.read("dimensions.cfg")

['dimensions.cfg']

In [13]:
# Credentials come from the DEFAULT section of the parsed config file.
api_client = connect_ds_api(
    username=CONFIG.get('DEFAULT', 'username'),
    password=CONFIG.get('DEFAULT', 'password'),
)

API credentials have been set


Define Dimensions searches

In [14]:
def run_exact_string_search(string,api_client):
    """
    Search Dimensions publication titles for `string` wrapped in escaped
    double quotes, and return the client's response unchanged.

    string: the phrase to look for; any double quotes inside it must already
        be escaped by the caller
    api_client: object exposing `execute_query(query_string_IN=...)`
    """
    query_template = 'search publications in title_only for "\\"{}\\"" return publications'
    return api_client.execute_query(query_string_IN=query_template.format(string))

def search_title(title):
    """
    Look up a publication title in Dimensions using the notebook-level
    `api_client`.

    title: publication title; double quotes are backslash-escaped before the
        query is built

    returns: the non-empty list found under the response's 'publications'
        key, or None when the search matched nothing or the response could
        not be read (a message is printed in both of those cases)
    """
    title =  title.replace('"','\\"')
    dimensions_return = run_exact_string_search(string = title,api_client = api_client)
    try:
        title_return = dimensions_return['publications']
        if len(title_return) > 0:
            return title_return
    except (KeyError, TypeError):
        # response is missing the 'publications' key, or is not a
        # subscriptable / sized object at all; report and carry on without
        # hiding KeyboardInterrupt or unrelated failures
        print('error with title {}'.format(title))
        return None

    print('nothing was returned')
    return None
        
def run_pub_id_search(dimensions_id,api_client):
    """
    Fetch the full publication record for one Dimensions id.

    dimensions_id: value matched against the publication `id` field
    api_client: object exposing `execute_query(query_string_IN=...)`

    returns: the first entry under the response's 'publications' key
    """
    query = 'search publications where id = "{}" return publications[all] limit 1'.format(dimensions_id)
    response = api_client.execute_query(query_string_IN=query)
    return response['publications'][0]

Search for all titles

In [15]:
import time

In [27]:
dimensions_results = []
for linkage in usda_linkage_list:
    # pause before every title query (presumably to stay under an API rate limit)
    time.sleep( 6 )

    title_matches = search_title(linkage['title'])
    if not title_matches:
        continue

    # take the first title match and pull its full record by id
    first_match_id = title_matches[0]['id']
    full_record = run_pub_id_search(dimensions_id = first_match_id,api_client = api_client)

    # update() mutates the dict that also lives in usda_linkage_list, so the
    # Dimensions fields are added to (and may overwrite keys of) that entry
    linkage.update(full_record)
    dimensions_results.append(linkage)

nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was returned
nothing was r

In [30]:
# 
# dimensions_results


In [33]:
# [u for u in usda_linkage_list if '2013' in u['title']]
# title = 'Household Food Security in the United States in 2013'
# [u for u in usda_linkage_list if   u['title'] == title]
# a = search_title(title)

In [None]:
# this_title = scrub_unicode('Improving the Assessment of SNAP Targeting Using Administrative Records')
# test = run_exact_string_search(string = this_title,api_client = api_client)

In [42]:
import os
import hashlib
import datetime

In [43]:
def get_hash (strings, prefix=None, digest_size=10):
    """
    build an identifier by hashing a collection of strings with BLAKE2b

    each string is UTF-8 encoded, lower-cased and stripped as bytes, and the
    resulting byte strings are fed to the hash in sorted order, so the order
    of `strings` does not affect the result

    strings: iterable of `str`
    prefix: optional text prepended to the hex digest
    digest_size: BLAKE2b digest size in bytes

    returns: the hex digest, with `prefix` in front when one is given
    """
    hasher = hashlib.blake2b(digest_size=digest_size)

    normalized = [s.encode("utf-8").lower().strip() for s in strings]
    normalized.sort()

    for chunk in normalized:
        hasher.update(chunk)

    digest = hasher.hexdigest()
    return prefix + digest if prefix else digest

In [46]:
# Machine-specific output directory; adjust it when running elsewhere.
results_dir = '/Users/sophierand/RichContextMetadata/metadata/20190913_usda_excel/results'

# get_hash expects a collection of strings: wrap the timestamp in a list so the
# whole timestamp is hashed, rather than its individual characters in sorted order.
run_tag = get_hash([str(datetime.datetime.now())])

pubs_path = os.path.join(results_dir, '{}_usda20190913_publications.json'.format(run_tag))
pubs_path


'/Users/sophierand/RichContextMetadata/metadata/20190913_usda_excel/results/95b25a3ed292789b8353_usda20190913_publications.json'

In [47]:
# Export every curated linkage; entries matched in Dimensions carry the extra
# fields merged into them by the enrichment loop.
with open(pubs_path, 'w') as pubs_file:
    json.dump(usda_linkage_list, pubs_file, indent=2)
