#### Import CSV containing publication-dataset linkages

Set `linkages_path` to the location of the CSV containing the dataset-publication linkages, then read the CSV into a dataframe.

In [43]:
import pandas as pd
import os

In [44]:
# The linkages CSV is looked up in the current working directory.
linkages_filename = 'IRI Sample_sbr.csv'
linkages_path = os.path.join(os.getcwd(), linkages_filename)
linkages_csv = pd.read_csv(linkages_path)

Format/clean linkage data - apply `scrub_unicode` to `title` field.

In [45]:
import unicodedata

In [46]:
def scrub_unicode (text):
    """
    Reduce a piece of source text to plain ASCII, as best as possible.

    Line breaks are collapsed to single spaces, typographic quotes, dashes
    and ellipses are mapped to ASCII look-alikes, literal backslash-u escape
    sequences left in the text (a backslash followed by e.g. 'u2019') are
    translated, and anything still non-ASCII is NFKD-decomposed with the
    non-ASCII remainder dropped.

    Parameters
    ----------
    text : str, bytes, or any other value
        ``bytes`` are decoded as UTF-8 (undecodable bytes are ignored).
        Any other non-string value (``None``, NaN, numbers) is returned
        unchanged so that missing values survive a ``Series.apply`` and can
        be filtered out afterwards.

    Returns
    -------
    str
        The scrubbed ASCII text (or the untouched input for non-text values).
    """
    if isinstance(text, bytes):
        # some content arrives as bytes rather than as a str
        text = text.decode("utf-8", "ignore")
    elif not isinstance(text, str):
        # None / NaN / numeric cell: there is nothing to scrub
        return text

    x = " ".join(line.strip() for line in text.split("\n")).strip()

    # Applied strictly in this order; later entries see the result of earlier ones.
    # Doubled backticks need no entry of their own: each backtick becomes an
    # apostrophe, and the doubled apostrophe is then turned into a double quote.
    replacements = (
        ('“', '"'), ('”', '"'),
        ("‘", "'"), ("’", "'"), ("`", "'"),
        ("''", '"'),
        ('…', '...'), ("\\u2026", "..."),
        ("\\u00ae", ""), ("\\u2122", ""),
        ("\\u00a0", " "), ("\\u2022", "*"), ("\\u00b7", "*"),
        ("\\u2018", "'"), ("\\u2019", "'"), ("\\u201a", "'"),
        ("\\u201c", '"'), ("\\u201d", '"'),
        ("\\u20ac", "€"),
        ("\\u2212", " - "),  # minus sign
        ("\\u00e9", "é"),
        ("\\u017c", "ż"), ("\\u015b", "ś"), ("\\u0142", "ł"),
        ("\\u0105", "ą"), ("\\u0119", "ę"), ("\\u017a", "ź"), ("\\u00f3", "ó"),
        ("\\u2014", " - "), ('–', '-'), ('—', ' - '),
        ("\\u2013", " - "), ("\\u00ad", " - "),
    )
    for old, new in replacements:
        x = x.replace(old, new)

    # NFKD splits accented letters into base letter + combining mark; encoding
    # to ASCII with "ignore" then keeps the base letter and drops the mark.
    # Characters without an ASCII decomposition (e.g. "€", "ł") are dropped entirely.
    return unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("ascii")

Scrub titles of problematic characters, drop nulls and dedupe

In [47]:
# Drop rows without a dataset or a title *before* scrubbing: a missing cell is
# read in as NaN (a float), which is not text and cannot be scrubbed.
linkages_csv = linkages_csv.loc[pd.notnull(linkages_csv.dataset) & pd.notnull(linkages_csv.title)]
# Scrub first and dedupe second, so rows whose titles differ only by
# problematic characters collapse into one.
linkages_csv = linkages_csv.assign(title=linkages_csv['title'].apply(scrub_unicode)).drop_duplicates()

Update required metadata fields - if your csv has any of the fields listed in optional or required fields, add them to `pub_metadata_fields`. 

In [48]:
# linkages_csv.head()

In [49]:
# Publication-level fields that are copied to the top level of each record.
pub_metadata_fields = ['title','doi','journal']
# Every remaining CSV column (except `dataset`) is kept as "original" metadata.
# Filtering the column list, rather than subtracting sets, keeps the columns
# in CSV order so the exported JSON has the same key order on every run.
_reserved_cols = set(pub_metadata_fields) | {'dataset'}
original_metadata_cols = [c for c in linkages_csv.columns.values.tolist() if c not in _reserved_cols]

#### Generate list of dicts of metadata

Read in `datasets.json`. Update `datasets_path` to point to your local copy of the file.

In [50]:
import json

In [51]:
datasets_path = '/Users/sophierand/RCDatasets/datasets.json'

# JSON is UTF-8 by specification; pin the encoding instead of relying on the
# platform default so the file is decoded the same way on every machine.
with open(datasets_path, encoding='utf-8') as json_file:
    datasets = json.load(json_file)

Create a list of dictionaries of publication metadata. `create_pub_dict` iterates through the `linkages_csv` dataframe and splits the `dataset` field (for when multiple datasets are listed); it prints a message if a dataset isn't listed in `datasets.json` and needs to be added.

In [52]:
def create_pub_dict(linkages_dataframe,datasets):
    """
    Build one publication-metadata dict per row of ``linkages_dataframe``.

    Each dict holds the columns named in the module-level
    ``pub_metadata_fields``, a ``datasets`` list with the ids parsed from the
    row's comma-separated ``dataset`` cell, and, when the module-level
    ``original_metadata_cols`` is non-empty, an ``original`` dict with those
    remaining columns.

    Parameters
    ----------
    linkages_dataframe : pandas.DataFrame
        Must contain string-valued ``title`` and ``dataset`` columns plus every
        column named in ``pub_metadata_fields`` / ``original_metadata_cols``.
    datasets : list of dict
        Known datasets; each entry must have an ``id`` key. A message is
        printed for every referenced id that is not in this list (the row is
        still exported).

    Returns
    -------
    list of dict
        One record per input row, in row order.
    """
    # Build the lookup once instead of rescanning `datasets` for every id.
    known_ids = {d['id'] for d in datasets}

    def _nan_to_none(record):
        # Empty CSV cells arrive as float NaN, which json.dump would write as
        # the bare token NaN (not valid JSON); None is written as null instead.
        return {k: (None if isinstance(v, float) and v != v else v) for k, v in record.items()}

    pub_dict_list = []
    for _, r in linkages_dataframe.iterrows():
        # Work on a copy so the row object handed out by iterrows is not written to.
        row = r.copy()
        row['title'] = scrub_unicode(row['title'])
        # Skip empty ids produced by stray/trailing commas.
        ds_id_list = [d.strip() for d in row['dataset'].split(",") if d.strip()]
        for ds in ds_id_list:
            if ds not in known_ids:
                print('dataset {} isnt listed in datasets.json. Please add to file'.format(ds))
        pub_dict = _nan_to_none(row[pub_metadata_fields].to_dict())
        pub_dict['datasets'] = ds_id_list
        if len(original_metadata_cols) > 0:
            pub_dict['original'] = _nan_to_none(row[original_metadata_cols].to_dict())
        pub_dict_list.append(pub_dict)
    return pub_dict_list

Generate publication metadata and export to json

In [53]:
# One metadata dict per linkage row, ready to be serialized.
linkage_list = create_pub_dict(linkages_dataframe=linkages_csv, datasets=datasets)

Update `pub_path` to be: 
`<name_of_subfolder>_publications.json`

In [54]:
import os
# File name follows the `<name_of_subfolder>_publications.json` convention.
pub_path = '20190726_usda_iri_publications.json'
partitions_dir = '/Users/sophierand/RCPublications/partitions/'
json_pub_path = os.path.join(partitions_dir, pub_path)

In [55]:
# Write the publication records to the partition file as indented JSON.
with open(json_pub_path, mode='w') as pub_file:
    json.dump(linkage_list, pub_file, indent=2)